# Read in TRAP predictoins
FullDataset=fread("TRAPP/DATA_V3_SARS2_ONLY/pred_scores/prott5_xl_bfd_peptides_sars2_mut_test_prediction_withlabel_2.txt")%>% distinct()
# Read in information supplied to TRAP, to link pMHC with score. This includes all pMHC generated via in silico mutagenesis
INPUT_DATA_TRAPP=fread("MUTAGENESIS_SUBSET_OMI_WUHAN_HLA_RUN_CHL_DNN_V4_INC_SUBVAR_WTNB_MTBINDER_ALLCOV2_MUTAGENESIS.txt")%>%
distinct()%>% mutate(BA_Rank = as.numeric(BA_Rank))%>%
mutate(nlog2Rank = - log2(BA_Rank))
INPUT_DATA_TRAPP=INPUT_DATA_TRAPP %>% mutate(nlog2Rank = round(nlog2Rank, digits=4))
FullDataset=FullDataset%>% mutate(nlog2Rank = round(nlog2Rank, digits=4)) # Use nlog2Rank to link pMHC with TRAP output score
FullDataset=FullDataset %>% inner_join(INPUT_DATA_TRAPP)
## Joining, by = c("Peptide", "nlog2Rank", "Dataset", "Predicted_Binding")
# Cleaning
FullDataset %>% select(Dataset)%>% table
## .
## Mutagenesis OmicronTest SUBVAR_MT_TEST SUBVAR_WT_TEST WuhanTest
## 86984 431 324 324 477
FullDataset=FullDataset%>%dplyr::rename(Prediction=prediction)
FullDataset=FullDataset %>% select(Peptide, Dataset, Predicted_Binding, Prediction)
FullDataset=FullDataset %>% filter(Dataset == "Mutagenesis")
#FullDataset %>% filter(Peptide %in% MUTATIONS$VariantAlignment)
#FullDataset %>% filter(Peptide %in% "SHRRARSVA")
FullDataset %>% select(Predicted_Binding) %>% table()
## .
## HLA-A0101 HLA-A0201 HLA-A0202 HLA-A0206 HLA-A0211 HLA-A0301 HLA-A0302 HLA-A1101
## 2258 1355 1339 1610 1467 1373 1374 1497
## HLA-A2301 HLA-A2402 HLA-A2501 HLA-A2601 HLA-A2902 HLA-A3001 HLA-A3002 HLA-A3101
## 1724 1794 1371 1533 2355 935 2161 1091
## HLA-A3201 HLA-A3303 HLA-A6801 HLA-A6802 HLA-A7401 HLA-B0702 HLA-B0801 HLA-B1402
## 1411 1021 1420 750 1335 935 862 1109
## HLA-B1501 HLA-B1503 HLA-B1801 HLA-B2705 HLA-B3501 HLA-B3503 HLA-B3801 HLA-B4001
## 1680 1718 718 281 1819 1002 764 639
## HLA-B4002 HLA-B4006 HLA-B4201 HLA-B4402 HLA-B4403 HLA-B4501 HLA-B5001 HLA-B5101
## 407 270 882 887 870 248 387 754
## HLA-B5201 HLA-B5301 HLA-B5701 HLA-B5801 HLA-B5802 HLA-C0102 HLA-C0202 HLA-C0210
## 1072 765 743 790 1039 2081 2185 2185
## HLA-C0302 HLA-C0303 HLA-C0304 HLA-C0401 HLA-C0501 HLA-C0602 HLA-C0701 HLA-C0702
## 1990 1593 1593 2022 1184 1445 1892 2488
## HLA-C0801 HLA-C0802 HLA-C1202 HLA-C1203 HLA-C1402 HLA-C1502 HLA-C1601 HLA-C1701
## 1812 1395 2143 1948 2552 1203 1763 1695
# Class I OMICRON MUTANTS
TEST_DATA_LOCATION = "NETMHCPAN_CLASSI_MUTAGENESIS/"
data_path <- TEST_DATA_LOCATION
files <- dir(data_path, pattern = "NetMHC_Results.csv")
data3 <- data_frame(file = files) %>%
mutate(file_contents = map(file,
~ fread(file.path(data_path, .),skip = 1))
)
## Warning: `data_frame()` was deprecated in tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
Netmhcpanres <- unnest(data3)
## Warning: `cols` is now required when using unnest().
## Please use `cols = c(file_contents)`
Netmhcpanres=Netmhcpanres %>% mutate(HLA_Allele = gsub(x=Netmhcpanres$file,pattern="Allele_|_NetMHC_Results.csv",replacement = ""))
Netmhcpanres=Netmhcpanres %>% select(! c(file,Pos,ID,core,icore)) %>% mutate(Binder = ifelse(NB==1,"BINDER","NONBINDER"))
Netmhcpanres=Netmhcpanres %>% mutate(HLA_Allele = gsub(x=HLA_Allele,pattern = "\\:",replacement = ""))
NETMHC_CLASSI=Netmhcpanres
NETMHC_CLASSI=NETMHC_CLASSI %>% mutate(MT_nM_41 = round(50000^(1-`BA-score`) ,digits=5))
NETMHC_CLASSI=NETMHC_CLASSI %>% mutate(Binder = ifelse(BA_Rank>=2, "NONBINDER","BINDER"))
#NETMHC_CLASSI = NETMHC_CLASSI %>% filter(Binder == "BINDER")
NETMHC_CLASSI %>% head
## # A tibble: 6 × 10
## Peptide `EL-score` EL_Rank `BA-score` BA_Rank Ave NB HLA_Allele Binder
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <chr> <chr>
## 1 AASHNIALIW 0.005 7.24 0.0896 8.15 0.005 0 HLA-A0101 NONBI…
## 2 ACSHNIALIW 0.0032 9.36 0.0788 10.4 0.0032 0 HLA-A0101 NONBI…
## 3 ADSHNIALIW 0.0052 7.08 0.0843 9.16 0.0052 0 HLA-A0101 NONBI…
## 4 AESHNIALIW 0.0046 7.61 0.0792 10.4 0.0046 0 HLA-A0101 NONBI…
## 5 AFSHNIALIW 0.003 9.70 0.0857 8.87 0.003 0 HLA-A0101 NONBI…
## 6 AGSHNIALIW 0.003 9.68 0.0742 11.7 0.003 0 HLA-A0101 NONBI…
## # … with 1 more variable: MT_nM_41 <dbl>
NETMHC_CLASSI %>% select(Peptide) %>% distinct()%>% nrow
## [1] 12362
MUTAGENESIS_BINDERS=NETMHC_CLASSI %>% select(Peptide, HLA_Allele, BA_Rank,Binder)%>% mutate(Dataset="Mutagenesis")%>% distinct()
MUTAGENESIS_BINDERS=MUTAGENESIS_BINDERS%>% dplyr::rename(Predicted_Binding=HLA_Allele, AASeq2=Peptide, MT_Binder=Binder, MT_BA_Rank = BA_Rank)
NETMHC_CLASSI %>% filter(Peptide %in% "EELKKLLEQW")
## # A tibble: 64 × 10
## Peptide `EL-score` EL_Rank `BA-score` BA_Rank Ave NB HLA_Allele Binder
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <chr> <chr>
## 1 EELKKLL… 0.0021 12.0 0.0297 51.9 0.0021 0 HLA-A0101 NONBI…
## 2 EELKKLL… 0.0002 34.8 0.0175 77.8 0.0002 0 HLA-A0201 NONBI…
## 3 EELKKLL… 0.0005 33.5 0.0353 67.5 0.0005 0 HLA-A0202 NONBI…
## 4 EELKKLL… 0.0005 33.9 0.0269 76.9 0.0005 0 HLA-A0206 NONBI…
## 5 EELKKLL… 0.0002 47.4 0.0206 78.6 0.0002 0 HLA-A0211 NONBI…
## 6 EELKKLL… 0.0001 31.7 0.0197 76.6 0.0001 0 HLA-A0301 NONBI…
## 7 EELKKLL… 0.0002 32 0.0202 72.4 0.0002 0 HLA-A0302 NONBI…
## 8 EELKKLL… 0.0001 30.2 0.0206 69.7 0.0001 0 HLA-A1101 NONBI…
## 9 EELKKLL… 0.0037 6.52 0.0709 18.1 0.0037 0 HLA-A2301 NONBI…
## 10 EELKKLL… 0.0022 7.97 0.0566 18.1 0.0022 0 HLA-A2402 NONBI…
## # … with 54 more rows, and 1 more variable: MT_nM_41 <dbl>
MUTATIONS = readRDS("OMICRON_EPITOPE_MUTATIONS.rds")
WUHAN_PEPTIDES_BINDING_SCORES=readRDS("ORIGINAL_PEPTIDES_BINDING_SCORES_V2.rds")
WUHAN_PEPTIDES_BINDING_SCORES=WUHAN_PEPTIDES_BINDING_SCORES %>% filter(Peptide %in% MUTATIONS$Peptide)
WUHAN_PEPTIDES_BINDING_SCORES %>% head
## # A tibble: 6 × 15
## Peptide Predicted_Binding `Thalf(h)` `EL-score` EL_Rank `BA-score` BA_Rank
## <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 AEHVNNSY HLA-A0101,HLA-A0… 0.2205,0.1… 0.0529,0,… 1.914,… 0.0989,0.… 6.7589…
## 2 AKSHNIALIW HLA-A0101,HLA-A0… 0.2148,0.1… 0.0028,1e… 10.094… 0.0679,0.… 13.890…
## 3 APGQTGKIA HLA-A0101,HLA-A0… 0.0831,0.0… 2e-04,0,1… 40.307… 0.0146,0.… 86.212…
## 4 APRITFGGP HLA-A0101,HLA-A0… 0.0697,0.0… 1e-04,0,0… 68.333… 0.0161,0.… 82.743…
## 5 CNDPFLGVY HLA-A0101,HLA-A0… 0.3908,0.1… 0.3985,1e… 0.3936… 0.4964,0.… 0.1649…
## 6 CNDPFLGVYY HLA-A0101,HLA-A0… 0.5458,0.1… 0.4072,0,… 0.3849… 0.509,0.0… 0.1504…
## # … with 8 more variables: Ave <chr>, Binder <chr>, nM_41 <chr>, nM <chr>,
## # Aff_Binder <chr>, Length <int>, HydrophobicCount <int>, HydroFraction <dbl>
# cleaning
WUHAN_PEPTIDES_BINDING_SCORES=WUHAN_PEPTIDES_BINDING_SCORES %>% select(Peptide, Predicted_Binding, BA_Rank, Binder)%>%
separate_rows_(cols = c("Predicted_Binding","BA_Rank","Binder"),sep=",")%>% mutate(Dataset = "WuhanTest")
WUHAN_PEPTIDES_BINDING_SCORES=WUHAN_PEPTIDES_BINDING_SCORES %>% dplyr::rename(AASeq1=Peptide)%>% select(!Dataset)
OMI_PEPTIDES_BINDING_SCORES=readRDS("OMICRON_PEPTIDES_BINDING_SCORES_V2.rds")
OMI_PEPTIDES_BINDING_SCORES %>% head
## # A tibble: 6 × 13
## VariantAlignment MT_Predicted_Bind… `MT_Thalf(h)` `MT_EL-score` MT_EL_Rank
## <chr> <chr> <chr> <chr> <chr>
## 1 AEYVNNSY HLA-A0101,HLA-A02… 0.2295,0.1438,… 0.0412,0,1e-0… 2.2565,55.…
## 2 AKSHNITLIW HLA-A0101,HLA-A02… 0.2395,0.1924,… 0.0048,1e-04,… 7.3782,47.…
## 3 ALRITFGGP HLA-A0101,HLA-A02… 0.0843,0.1071,… 0,9e-04,0.004… 73.4615,18…
## 4 APGQTGNIA HLA-A0101,HLA-A02… 0.0906,0.1022,… 5e-04,0,1e-04… 29.3103,52…
## 5 CNDPFLDHK HLA-A0101,HLA-A02… 0.1508,0.093,0… 0.0105,0,1e-0… 4.7481,63.…
## 6 CNDPFLDHKN HLA-A0101,HLA-A02… 0.0855,0.0757,… 2e-04,0,0,0,0… 46,90,95,9…
## # … with 8 more variables: MT_BA-score <chr>, MT_BA_Rank <chr>, MT_Ave <chr>,
## # MT_Binder <chr>, MT_nM_41 <chr>, Predicted_Binding <chr>, MT_nM <chr>,
## # MT_Aff_Binder <chr>
OMI_PEPTIDES_BINDING_SCORES=OMI_PEPTIDES_BINDING_SCORES %>% select(VariantAlignment, Predicted_Binding, MT_BA_Rank, MT_Binder)%>%
separate_rows_(cols = c("Predicted_Binding","MT_BA_Rank","MT_Binder"),sep=",")%>%
dplyr::rename(AASeq2=VariantAlignment)%>%
mutate(Dataset = "OmicronTest")
OMI_PEPTIDES_BINDING_SCORES %>% filter(AASeq2 == "SHRRARSVA")
## # A tibble: 64 × 5
## AASeq2 Predicted_Binding MT_BA_Rank MT_Binder Dataset
## <chr> <chr> <chr> <chr> <chr>
## 1 SHRRARSVA HLA-A0101 63.4561 NONBINDER OmicronTest
## 2 SHRRARSVA HLA-A0201 81.6515 NONBINDER OmicronTest
## 3 SHRRARSVA HLA-A0202 77.2365 NONBINDER OmicronTest
## 4 SHRRARSVA HLA-A0206 83.1944 NONBINDER OmicronTest
## 5 SHRRARSVA HLA-A0211 78.4834 NONBINDER OmicronTest
## 6 SHRRARSVA HLA-A0301 34.8778 NONBINDER OmicronTest
## 7 SHRRARSVA HLA-A0302 53.217 NONBINDER OmicronTest
## 8 SHRRARSVA HLA-A1101 52.194 NONBINDER OmicronTest
## 9 SHRRARSVA HLA-A2301 42.1591 NONBINDER OmicronTest
## 10 SHRRARSVA HLA-A2402 37.9478 NONBINDER OmicronTest
## # … with 54 more rows
# Mutant data consists of the Mutagenesis set + the Omicron epitope.
MT_DATA=MUTAGENESIS_BINDERS %>% rbind(OMI_PEPTIDES_BINDING_SCORES) %>% select(!Dataset)%>% distinct()
# WT data consists of the Wuhan set
WT_DATA = WUHAN_PEPTIDES_BINDING_SCORES
#FullDataset %>% mutate(Type = ifelse)
# Read in 'INSILICO_MUTAGENESIS_WUHANOMICRON' - this links WT to each respective mutant.
# Filter for substitutions and 9/10mers
INSILICO_MUTAGENESIS_WUHANOMICRON=readRDS("INSILICO_MUTAGENESIS_WUHANOMICRON.rds") %>% filter(Mut_type == "Substitution")
INSILICO_MUTAGENESIS_WUHANOMICRON=INSILICO_MUTAGENESIS_WUHANOMICRON %>% mutate(Length = nchar(AASeq1))%>% filter(Length %in% c(9,10))
# Inner join all the data. So for each row we have WTPeptide, MTPeptide, HLA, Rank scores for WT and MT.
INSILICO_MUTAGENESIS_WUHANOMICRON=INSILICO_MUTAGENESIS_WUHANOMICRON %>% left_join(MT_DATA)%>% drop_na()
## Joining, by = "AASeq2"
INSILICO_MUTAGENESIS_WUHANOMICRON=INSILICO_MUTAGENESIS_WUHANOMICRON %>% left_join(WUHAN_PEPTIDES_BINDING_SCORES)%>% drop_na()
## Joining, by = c("AASeq1", "Predicted_Binding")
INSILICO_MUTAGENESIS_WUHANOMICRON=INSILICO_MUTAGENESIS_WUHANOMICRON%>% filter(! (Binder == "NONBINDER" & MT_Binder == "NONBINDER"))
INSILICO_MUTAGENESIS_WUHANOMICRON=INSILICO_MUTAGENESIS_WUHANOMICRON %>% select(order(colnames(INSILICO_MUTAGENESIS_WUHANOMICRON)))
# AASeq2 is the mutant peptide.
FullDataset=FullDataset %>% dplyr::rename(AASeq2=Peptide)
ANTIJOIN_SET=INSILICO_MUTAGENESIS_WUHANOMICRON %>% anti_join(FullDataset)
## Joining, by = c("AASeq2", "Predicted_Binding")
# Join by column AASeq2. Prediction here is the TRAP score for mutant peptides.
INSILICO_MUTAGENESIS_WUHANOMICRON = INSILICO_MUTAGENESIS_WUHANOMICRON%>% inner_join(FullDataset)
## Joining, by = c("AASeq2", "Predicted_Binding")
ANTIJOIN_SET%>% mutate(GROUP = paste0(Binder, "_",MT_Binder))%>% select(GROUP) %>% table
## .
## BINDER_BINDER BINDER_NONBINDER NONBINDER_BINDER
## 215 28581 12
# Deal w binder binder. Somehow theyrer not in fulldataset, maybe we can't link them directly by matching nlog2Rank.
# So find the scores and create the data
BINDER_BINDER=ANTIJOIN_SET %>% mutate(GROUP = paste0(Binder, "_",MT_Binder)) %>% filter(GROUP=="BINDER_BINDER")
BINDER_BINDER=BINDER_BINDER %>%
left_join(fread("TRAPP/DATA_V3_SARS2_ONLY/pred_scores/prott5_xl_bfd_peptides_sars2_mut_test_prediction_withlabel_2.txt")%>%
distinct()%>%
dplyr::rename(AASeq2=Peptide)) %>%
select(AASeq1, AASeq2, BA_Rank, Binder, Length, MT_BA_Rank, MT_Binder, Mut_type, Predicted_Binding, prediction) %>%
dplyr::rename(Prediction=prediction)%>% distinct()
## Joining, by = c("AASeq2", "Predicted_Binding")
# Deal w/ binder non-binder: impute a TRAP score of 0.01 for the MT.
BINDER_NONBINDER = ANTIJOIN_SET%>% mutate(GROUP = paste0(Binder, "_",MT_Binder)) %>% filter(GROUP=="BINDER_NONBINDER")
BINDER_NONBINDER = BINDER_NONBINDER %>% select(! GROUP) %>% mutate(Prediction=0.01)
BINDER_NONBINDER%>% select(AASeq2, MT_BA_Rank, Predicted_Binding) %>%
dplyr::rename(Peptide=AASeq2, BA_Rank=MT_BA_Rank)%>% distinct() %>%
readr::write_csv(file="NONBINDERS_FOR_TRAPP_MUTAGENESIS.csv")
INSILICO_MUTAGENESIS_WUHANOMICRON=INSILICO_MUTAGENESIS_WUHANOMICRON %>% select(! Dataset) %>%
distinct() %>% rbind(BINDER_BINDER, BINDER_NONBINDER)%>%
mutate(AASeq2_Prediction= Prediction)
# FullDataset now contains the WT TRAP score.
FullDataset=fread("TRAPP/DATA_V3_SARS2_ONLY/pred_scores/prott5_xl_bfd_peptides_sars2_mut_test_prediction_withlabel_2.txt")%>% distinct()
# Link to the specific pMHC via nlog2Rank. This is done because TRAP output lacks the particular MHC for which the observation was generated.
INPUT_DATA_TRAPP=fread("MUTAGENESIS_SUBSET_OMI_WUHAN_HLA_RUN_CHL_DNN_V4_INC_SUBVAR_WTNB_MTBINDER_ALLCOV2_MUTAGENESIS.txt")%>%
distinct()%>% mutate(BA_Rank = as.numeric(BA_Rank))%>%
mutate(nlog2Rank = - log2(BA_Rank))
INPUT_DATA_TRAPP=INPUT_DATA_TRAPP %>% mutate(nlog2Rank = round(nlog2Rank, digits=4))
FullDataset=FullDataset%>% mutate(nlog2Rank = round(nlog2Rank, digits=4))
FullDataset=FullDataset %>% inner_join(INPUT_DATA_TRAPP)
## Joining, by = c("Peptide", "nlog2Rank", "Dataset", "Predicted_Binding")
# Clean. Filter for Wuhan WT peptide
FullDataset %>% select(Dataset)%>% table
## .
## Mutagenesis OmicronTest SUBVAR_MT_TEST SUBVAR_WT_TEST WuhanTest
## 86984 431 324 324 477
FullDataset=FullDataset%>%dplyr::rename(Prediction=prediction)
FullDataset=FullDataset %>% select(Peptide, Dataset, Predicted_Binding, Prediction)
FullDataset=FullDataset %>% filter(Peptide %in% MUTATIONS$Peptide)
# AASeq1 is the WT peptide.
FullDataset=FullDataset %>% select(! Dataset)%>% distinct()%>% dplyr::rename(AASeq1=Peptide, WT_Prediction=Prediction)
# Leftjoin with our mutagenesis dataset
INSILICO_MUTAGENESIS_WUHANOMICRON=INSILICO_MUTAGENESIS_WUHANOMICRON %>% left_join(FullDataset)
## Joining, by = c("AASeq1", "Predicted_Binding")
# Impute a score for those where WT was a nonbinder but MT now binds. This MT may now be immunogenic thus must be analysed.
INSILICO_MUTAGENESIS_WUHANOMICRON=INSILICO_MUTAGENESIS_WUHANOMICRON %>%
mutate(WT_Prediction = replace(WT_Prediction, (Binder == "NONBINDER" & MT_Binder == "BINDER"), 0.01))
# Are there any NA generated from the left join? No, this means we have a WT and MT score for each observation.
colnms = colnames(INSILICO_MUTAGENESIS_WUHANOMICRON)
INSILICO_MUTAGENESIS_WUHANOMICRON %>%
filter_at(vars(all_of(colnms)), any_vars(is.na(.)))
## Empty data.table (0 rows and 12 cols): AASeq1,AASeq2,BA_Rank,Binder,Length,MT_BA_Rank...
# Cleaning.
INSILICO_MUTAGENESIS_WUHANOMICRON=INSILICO_MUTAGENESIS_WUHANOMICRON %>%
select(!Prediction)%>% dplyr::rename(Prediction = AASeq2_Prediction, WuhanScore = WT_Prediction)
FULL_PREDICTIONS_DT=INSILICO_MUTAGENESIS_WUHANOMICRON%>% dplyr::rename(Peptide=AASeq1)
FULL_PREDICTIONS_DT=FULL_PREDICTIONS_DT %>% filter(! Peptide == AASeq2)
FULL_PREDICTIONS_DT=FULL_PREDICTIONS_DT %>% dplyr::rename(TRAPP_MUTANT_Prediction=Prediction)%>%
dplyr::rename(TRAPP_Prediction = WuhanScore)%>% mutate(BA_Rank = as.numeric(BA_Rank))%>%
mutate(MT_BA_Rank = as.numeric(MT_BA_Rank))
# Now, we want to produce an overall score incorporating both (TRAP score * MHC binding score)
# MHCScore is produced by a negative logit function of the Rank score. X-2 as the threshold for 2 being the cutoff threshold for binding.
# 1.5 selected abritrarily to reduce the magnitude of the curve.. so rank scores of e.g., 2.1 aren't treated as harshly as they may be with a higher value such as 5. We are here beign a little conservative.
#negative logistic function
neg_logit_function = function(x) {
1/(1+exp(1.5*(x-2)))
}
# Transform WT and MT MHCScore
FULL_PREDICTIONS_DT=FULL_PREDICTIONS_DT %>% mutate(MHCScore = neg_logit_function(BA_Rank))%>% mutate(MT_MHCScore = neg_logit_function(MT_BA_Rank))
#FULL_MUTATIONS_DT %>% mutate(MHCScore = min_max_norm(BA_Rank))%>% mutate(MT_MHCScore = min_max_norm(MT_BA_Rank))
# Calculate the Wuhan Immunogenicity score and the MT Immunogenicity score (Prediction)
FULL_PREDICTIONS_DT=FULL_PREDICTIONS_DT %>% mutate(WuhanScore = TRAPP_Prediction * MHCScore)%>% mutate(Prediction = TRAPP_MUTANT_Prediction* MT_MHCScore)
# To avoid huge skews at the limit approaching zero, we replace anything with <0.01 with 0.01
FULL_PREDICTIONS_DT=FULL_PREDICTIONS_DT %>% mutate(WuhanScore = replace(WuhanScore, WuhanScore<0.01, 0.01))%>%
mutate(Prediction = replace(Prediction, Prediction<0.01, 0.01))
#FULL_MUTATIONS_DT=FULL_MUTATIONS_DT %>% rbind(NA_SET_BINDER_NONBINDER)%>% rbind(NA_SET_NONBINDER_BINDER)
FULL_PREDICTIONS_DT=FULL_PREDICTIONS_DT %>% mutate(WuhanScore = round(WuhanScore, digits=4))%>%
mutate(Prediction = round(Prediction, digits=4))%>% distinct()
# Anything rounded to == 0 is set to 0.01
FULL_PREDICTIONS_DT=FULL_PREDICTIONS_DT %>% mutate(WuhanScore = replace(WuhanScore, WuhanScore==0.0, 0.01))%>%
mutate(Prediction = replace(Prediction, Prediction==0.0, 0.01))
FULL_PREDICTIONS_DT %>% group_by(Peptide)%>% dplyr::summarise(meanScore = mean(WuhanScore))%>% slice_min(order_by = meanScore)
## # A tibble: 1 × 2
## Peptide meanScore
## <chr> <dbl>
## 1 NGVEGFNCY 0.204
# Remove likely FN
FN_EPS=FULL_PREDICTIONS_DT %>% group_by(Peptide)%>% dplyr::summarise(maxScore = max(WuhanScore))%>% filter(maxScore < 0.45)%>% pull(Peptide)
FULL_PREDICTIONS_DT=FULL_PREDICTIONS_DT%>% filter(! Peptide %in% FN_EPS)
# Get rid of pMHC observations where WT and MT unlikely to be immunogenic.
FULL_PREDICTIONS_DT=FULL_PREDICTIONS_DT %>% filter(!(Prediction < 0.35 & WuhanScore < 0.35))
FULL_PREDICTIONS_DT %>% dplyr::rename("Variant Peptide"=AASeq2, "HLA Allele"=Predicted_Binding,MutantScore=Prediction,TRAPP_Wuhan=TRAPP_Prediction)%>%
readr::write_csv(file="/Users/paulbuckley/Nexus365/WIMM CCB - Koohy Group - Documents/Koohy Group/Effect_Mutation_Tcells/V9/WORKING_VERSION/Supplementary Tables/Table3_MutagenesisPredictions.csv")
#saveRDS(FULL_PREDICTIONS_DT, file="INSILICO_MUTAGENESIS_COMPILED_PREDICTIONS_SUBONLY.rds")
neighborNetwork <- function(
peptideSet,
numSet=NULL,
directed=F,
weighted=F,
coreN=1
){
# Start calculation
set.seed(12345)
time.start <- proc.time()
# Explore neighbor pairs
peptideList <- split(peptideSet, nchar(peptideSet))
cat("Searching neighbors by single-aa substitutions...\n")
dt_neighbor_sub <- foreach::foreach(i=1:length(peptideList), .inorder=F)%do%{
l <- nchar(peptideList[[i]][1])
dt_edges <- apply(data.table::CJ(1:l, Biostrings::AA_STANDARD), 1,
function(v){
aa_pos <- as.numeric(v[1])
aa_sub <- v[2]
s <- peptideList[[i]]
s_prime <- s
stringr::str_sub(s_prime, start=aa_pos, end=aa_pos) <- aa_sub
pairPos <- setdiff(which(s_prime %in% s), which(s_prime==s))
dt_edges <- data.table::data.table(
"AASeq1"=s,
"AASeq2"=s_prime,
"AA1"=stringr::str_sub(s, start=aa_pos, end=aa_pos),
"MutPosition"=aa_pos,
"AA2"=aa_sub,
"MutPattern"="Substitution"
)
dt_edges <- dt_edges[pairPos,]
dt_edges[,MutType:=paste0(AA1, "_", MutPosition, "_", AA2)][,AA1:=NULL][,MutPosition:=NULL][,AA2:=NULL]
return(dt_edges)
})
data.table::rbindlist(dt_edges)
} %>%
data.table::rbindlist() %>%
unique(fromLast=F, by=c("AASeq1","AASeq2","MutPattern"))
gc();gc()
cat("Merging...\n")
dt_neighbor <- rbind(
dt_neighbor_sub
)
dt_neighbor[,MutPattern:=factor(MutPattern, levels=c("Substitution"))]
data.table::setorder(dt_neighbor, AASeq1, AASeq2, MutPattern, MutType)
# Construct a neighbor network object
net_neighbor <- igraph::graph_from_data_frame(dt_neighbor, directed=F)
net_neighbor <- igraph::set_vertex_attr(net_neighbor, name="label", value=igraph::V(net_neighbor)$name)
if(is.null(numSet)){
out <- list("NeighborNetwork"=net_neighbor, "PairDF"=dt_neighbor)
}else{
## Weighted and directed network using a given set of scores
## Direction is determined such that the score always increases after mutation
dt_num <- data.table::data.table("Peptide"=peptideSet, "Score"=numSet)
dt_neighbor_dw <- data.table::copy(dt_neighbor)
dt_neighbor_dw <- merge(dt_neighbor_dw, dt_num, by.x="AASeq1", by.y="Peptide")
dt_neighbor_dw <- merge(dt_neighbor_dw, dt_num, by.x="AASeq2", by.y="Peptide", suffixes=c("1","2"))
dt_neighbor_dw <- dt_neighbor_dw[Score2>=Score1,]
dt_neighbor_dw[,ScoreRatio:=Score2/Score1]
dt_neighbor_dw[,EdgeWeight:=Score1/Score2]
dt_neighbor_dw[EdgeWeight<0.001, EdgeWeight:=0.001] ### set an arbitrary minimum threshold
net_neighbor_dw <- igraph::graph_from_data_frame(dt_neighbor_dw, directed=directed)
net_neighbor_dw <- igraph::set_vertex_attr(net_neighbor_dw, name="label", value=igraph::V(net_neighbor_dw)$name)
if(weighted==T) igraph::E(net_neighbor_dw)$weight <- igraph::E(net_neighbor_dw)$EdgeWeight
out <- list(
"NeighborNetwork"=net_neighbor, "PairDF"=dt_neighbor,
"NeighborNetwork_DW"=net_neighbor_dw, "PairDF_DW"=dt_neighbor_dw
)
}
# Finish the timer
time.end <- proc.time()
message("Overall time required = ", format((time.end-time.start)[3], nsmall=2), "[sec]")
# Output
gc();gc()
return(out)
}
neighborNetwork_Cluster <- function(peptide, graph, metadataDF, seed=12345, plot=T){
## Peptide labels
peptideLabels <- peptide
peptideSet <- igraph::V(graph)$"name"
igraph::V(graph)$label[!peptideSet %in% peptideLabels] <- ""
## Metadata
metadataDF <- dplyr::filter(metadataDF, Peptide %in% peptideSet)
df_meta <- dplyr::select(metadataDF, Peptide, ImmunogenicityScore)
if("Immunogenicity" %in% colnames(metadataDF)){
df_meta$Immunogenicity <- metadataDF$Immunogenicity
igraph::V(graph)$Immunogenicity <- df_meta$Immunogenicity
}
igraph::V(graph)$ImmunogenicityScore <- df_meta$ImmunogenicityScore
## Clusters
set.seed(seed)
df_meta <- dplyr::left_join(
dplyr::tibble("Peptide"=peptideSet,
"Target"=dplyr::if_else(peptideSet==peptide, "Target", "Neighbor"),
"ClusterID"=paste0("Cluster", igraph::cluster_walktrap(graph)$membership)),
df_meta,
by="Peptide"
)
## No plot ver.
if(plot!=T) return(df_meta)
## Coordinates
set.seed(seed)
l <- igraph::layout_nicely(graph)
df_meta <- dplyr::left_join(
magrittr::set_colnames(cbind(dplyr::tibble("Peptide"=peptideSet), as.data.frame(l)), c("Peptide","x","y")),
df_meta,
by="Peptide"
)
## Consensus per cluster
clusteredPeptides <- df_meta %>%
dplyr::arrange(ClusterID) %>%
dplyr::mutate(ClusterID=as.character(ClusterID)) %>%
data.table::as.data.table() %>%
split(by="ClusterID") %>%
lapply(function(d){d[["Peptide"]]})
consensusSequence <- function(sequenceSet){
if(length(sequenceSet)==1) return(sequenceSet)
sink(tempfile())
s <- msa::msaConsensusSequence(msa::msaClustalW(sequenceSet, type="protein"), type="Biostrings", ambiguityMap="X", threshold=0.5)
sink()
return(s)
}
clusterConsensusSeqs <- sapply(clusteredPeptides, consensusSequence)
clusterConsensusSeqs <- stringr::str_replace_all(clusterConsensusSeqs, stringr::fixed("-"), "X")
## Graph plot
clusterGraphPlot <- function(g, meta, layout, seed=12345){
### Vertex annotations
colPal <- function(x){
pal <- colorRamp(append(ggsci::pal_d3()(2), "white", after=1), space="rgb")
cols <- pal(x)
apply(cols, 1, function(x){rgb(x[1], x[2], x[3], maxColorValue=255)})
}
clusterColors <- meta %>%
dplyr::group_by(ClusterID) %>%
dplyr::summarise(ImmunogenicityColor=colPal(mean(ImmunogenicityScore)))
clusterColors <- scales::alpha(clusterColors$"ImmunogenicityColor", alpha=0.75)
vertexColors <- colPal(meta$"ImmunogenicityScore")
if(is.null(meta$"Immunogenicity")){
vertexShapes <- "circle"
}else{
vertexShapes <- dplyr::if_else(meta$"Immunogenicity"=="Positive", "circle", "square")
}
vertexLabels <- igraph::V(g)$"label"
### Cluster labels
clusterCentroids <- meta %>%
dplyr::group_by(ClusterID) %>%
dplyr::summarise(x=mean(x), y=mean(y))
clusterCentroids$x <- scales::rescale(clusterCentroids$x, to=c(-1, 1))
clusterCentroids$y <- scales::rescale(clusterCentroids$y, to=c(-1, 1))
### Graph
try(dev.off(), silent=T)
set.seed(seed)
plot(
igraph::cluster_walktrap(g), g,
layout=layout,
col=vertexColors,
mark.border="black",
mark.col=clusterColors,
vertex.size=3,
vertex.shape=vertexShapes,
vertex.label=vertexLabels,
vertex.label.color="black",
vertex.label.cex=1.25,
vertex.label.dist=0.5,
vertex.label.family="sans",
vertex.color=vertexColors,
vertex.frame.color="black",
edge.width=0.5,
edge.arrow.size=0.25,
edge.arrow.width=0.1,
edge.color=scales::alpha("gray50", 0.75)
)
for(i in 1:length(clusterCentroids$ClusterID)){
text(
rep(clusterCentroids$x[[i]], 2),
clusterCentroids$y[[i]]+c(0, -0.1),
c(clusterCentroids$ClusterID[[i]], clusterConsensusSeqs[[i]]),
pos=3, cex=1.25, family="sans"
)
}
return(recordPlot())
}
neighborPlot <- clusterGraphPlot(
g=graph,
meta=df_meta,
layout=l,
seed=seed
)
return(list(
"SummaryDF"=df_meta,
"NeighborPlot"=neighborPlot
))
}
#' @export
#' @rdname NeighborNetwork_Clustering
#' @name NeighborNetwork_Clustering
neighborNetwork_Cluster_Batch <- function(neighborNetResult, metadataDF, seed=12345, coreN=parallel::detectCores(logical=F),origPeptide=PEPTIDE){
## Extract peptide sequences
peptideSet <- igraph::V(neighborNetResult$"NeighborNetwork_DW")$"name"
#orig=origPeptide
## Cluster analysis
message("Clustering neighbor network...")
res <- foreach::foreach(pept=peptideSet, .inorder=F)%do%{
graph <- Repitope::neighborNetwork_ConnectedSubGraph(neighborNetResult, pept)
df_meta <- neighborNetwork_Cluster(pept, graph, metadataDF, seed)
pos <- grep("Target", df_meta$"Target")
clust <- df_meta$"ClusterID"[[pos]]
score <- df_meta$"ImmunogenicityScore"[[pos]]
list("Peptide"=pept, "Score"=score, "ClusterID"=clust, "SummaryDF"=df_meta)
}
gc();gc()
return(res)
}
neighborNetwork_Cluster_FeatureDF <- function(neighborNetClusterResult, coreN=parallel::detectCores(logical=F)){
message("Computing cluster-based metrics...")
dt_feat <- foreach::foreach(i=1:length(neighborNetClusterResult), .inorder=F, .packages=c("dplyr","data.table"))%do%{
summaryDF <- neighborNetClusterResult[[i]][["SummaryDF"]]$SummaryDF
summaryDF <- dplyr::arrange(summaryDF, dplyr::desc(Target))
dt <- data.table::as.data.table(summaryDF[1,])
aveDF <- summaryDF %>%
dplyr::group_by(ClusterID) %>%
dplyr::summarise(ImmunogenicityScore=mean(ImmunogenicityScore))
dt[,ImmunogenicityScore_Cluster_Average:=dplyr::filter(aveDF, ClusterID==dt$"ClusterID"[1])$"ImmunogenicityScore"]
aveDF <- dplyr::filter(aveDF, ClusterID!=dt$"ClusterID"[1])
dt[,ImmunogenicityScore_Cluster_Diff_Max:=max(aveDF$"ImmunogenicityScore") - ImmunogenicityScore_Cluster_Average]
dt[,ImmunogenicityScore_Cluster_Diff_Min:=ImmunogenicityScore_Cluster_Average - min(aveDF$"ImmunogenicityScore")] ## Considered to be an "escape potential"
dt[,ImmunogenicityScore_Diff_Max:=max(summaryDF$"ImmunogenicityScore") - ImmunogenicityScore]
dt[,ImmunogenicityScore_Diff_Min:=ImmunogenicityScore - min(summaryDF$"ImmunogenicityScore")]
dt
} %>%
data.table::rbindlist()
gc();gc()
return(dt_feat)
}
neighborNetwork_Cluster <- function(peptide, graph, metadataDF, seed=12345, plot=T){
## Peptide labels
peptideLabels <- peptide
peptideSet <- igraph::V(graph)$"name"
igraph::V(graph)$label[!peptideSet %in% peptideLabels] <- ""
## Metadata
metadataDF <- dplyr::filter(metadataDF, Peptide %in% peptideSet)
df_meta <- dplyr::select(metadataDF, Peptide, ImmunogenicityScore)
if("Immunogenicity" %in% colnames(metadataDF)){
df_meta$Immunogenicity <- metadataDF$Immunogenicity
igraph::V(graph)$Immunogenicity <- df_meta$Immunogenicity
}
igraph::V(graph)$ImmunogenicityScore <- df_meta$ImmunogenicityScore
## Clusters
set.seed(seed)
df_meta <- dplyr::left_join(
dplyr::tibble("Peptide"=peptideSet,
"Target"=dplyr::if_else(peptideSet==peptide, "Target", "Neighbor"),
"ClusterID"=paste0("Cluster", igraph::cluster_walktrap(graph)$membership)),
df_meta,
by="Peptide"
)
## No plot ver.
if(plot!=T) return(df_meta)
## Coordinates
set.seed(seed)
l <- igraph::layout_nicely(graph)
df_meta <- dplyr::left_join(
magrittr::set_colnames(cbind(dplyr::tibble("Peptide"=peptideSet), as.data.frame(l)), c("Peptide","x","y")),
df_meta,
by="Peptide"
)
## Consensus per cluster
clusteredPeptides <- df_meta %>%
dplyr::arrange(ClusterID) %>%
dplyr::mutate(ClusterID=as.character(ClusterID)) %>%
data.table::as.data.table() %>%
split(by="ClusterID") %>%
lapply(function(d){d[["Peptide"]]})
consensusSequence <- function(sequenceSet){
if(length(sequenceSet)==1) return(sequenceSet)
sink(tempfile())
s <- msa::msaConsensusSequence(msa::msaClustalW(sequenceSet, type="protein"), type="Biostrings", ambiguityMap="X", threshold=0.5)
sink()
return(s)
}
clusterConsensusSeqs <- sapply(clusteredPeptides, consensusSequence)
clusterConsensusSeqs <- stringr::str_replace_all(clusterConsensusSeqs, stringr::fixed("-"), "X")
## Graph plot
clusterGraphPlot <- function(g, meta, layout, seed=12345){
### Vertex annotations
colPal <- function(x){
pal <- colorRamp(append(ggsci::pal_d3()(2), "white", after=1), space="rgb")
cols <- pal(x)
apply(cols, 1, function(x){rgb(x[1], x[2], x[3], maxColorValue=255)})
}
clusterColors <- meta %>%
dplyr::group_by(ClusterID) %>%
dplyr::summarise(ImmunogenicityColor=colPal(mean(ImmunogenicityScore)))
clusterColors <- scales::alpha(clusterColors$"ImmunogenicityColor", alpha=0.75)
vertexColors <- colPal(meta$"ImmunogenicityScore")
if(is.null(meta$"Immunogenicity")){
vertexShapes <- "circle"
}else{
vertexShapes <- dplyr::if_else(meta$"Immunogenicity"=="Positive", "circle", "square")
}
vertexLabels <- igraph::V(g)$"label"
### Cluster labels
clusterCentroids <- meta %>%
dplyr::group_by(ClusterID) %>%
dplyr::summarise(x=mean(x), y=mean(y))
clusterCentroids$x <- scales::rescale(clusterCentroids$x, to=c(-1, 1))
clusterCentroids$y <- scales::rescale(clusterCentroids$y, to=c(-1, 1))
### Graph
try(dev.off(), silent=T)
set.seed(seed)
plot(
igraph::cluster_walktrap(g), g,
layout=layout,
col=vertexColors,
mark.border="black",
mark.col=clusterColors,
vertex.size=3,
vertex.shape=vertexShapes,
vertex.label=vertexLabels,
vertex.label.color="black",
vertex.label.cex=1.25,
vertex.label.dist=0.5,
vertex.label.family="sans",
vertex.color=vertexColors,
vertex.frame.color="black",
edge.width=0.5,
edge.arrow.size=0.25,
edge.arrow.width=0.1,
edge.color=scales::alpha("gray50", 0.75)
)
for(i in 1:length(clusterCentroids$ClusterID)){
text(
rep(clusterCentroids$x[[i]], 2),
clusterCentroids$y[[i]]+c(0, -0.1),
c(clusterCentroids$ClusterID[[i]], clusterConsensusSeqs[[i]]),
pos=3, cex=1.25, family="sans"
)
}
return(recordPlot())
}
neighborPlot <- clusterGraphPlot(
g=graph,
meta=df_meta,
layout=l,
seed=seed
)
return(list(
"SummaryDF"=df_meta,
"NeighborPlot"=neighborPlot
))
}
FILEPATH = "NEIGHBOR_NETWORK_ANALYSIS_TRAPP/"
dir.create(FILEPATH)
# For each unique WT wuhan peptide.
for(i in 1:length(unique(FULL_PREDICTIONS_DT$Peptide))){
PEPTIDE = unique(FULL_PREDICTIONS_DT$Peptide)[i]
DATASET = FULL_PREDICTIONS_DT %>% filter(Peptide%in% PEPTIDE)
# Find the Wuhan scores across different pMHC
WUHAN_SCORE_DISTR = DATASET %>% filter(Binder == "BINDER") %>% select(Peptide, Predicted_Binding, WuhanScore) %>% distinct() %>% mutate(Dataset="Wuhan") %>% dplyr::rename(Prediction=WuhanScore)
# Find the mutants
MT_SCORE_DIST = DATASET %>% select(AASeq2, Predicted_Binding, Prediction) %>% distinct() %>% mutate(Dataset="MT")%>% dplyr::rename(Peptide=AASeq2)
# For the original score across pMHC, take the mean
ORIG_SCORE=DATASET %>% filter(Binder == "BINDER") %>% select(Peptide, Predicted_Binding, WuhanScore) %>% distinct() %>% group_by(Peptide) %>% dplyr::summarise(ImmunogenicityScore=mean(WuhanScore))%>% ungroup()
# Average across MHC for each WT-MT unique sample
DATASET=DATASET%>% group_by(Peptide, AASeq2) %>% dplyr::summarise(ImmunogenicityScore=mean(Prediction))
# Generate the neighbour network
DATASET_NEIGHBOUR = DATASET %>% ungroup%>% select(!Peptide)%>% dplyr::rename(Peptide=AASeq2)
DATASET_NEIGHBOUR=DATASET_NEIGHBOUR %>% as.data.table
DATASET_NEIGHBOUR=ORIG_SCORE %>% rbind(DATASET_NEIGHBOUR)%>% as.data.table
nnet_ISM <- neighborNetwork(DATASET_NEIGHBOUR$Peptide, DATASET_NEIGHBOUR$ImmunogenicityScore)
PEPTIDE_FILEPATH= paste0(FILEPATH,"/",PEPTIDE,"/")
dir.create(PEPTIDE_FILEPATH)
saveRDS(nnet_ISM, file=paste0(PEPTIDE_FILEPATH,"/NeighborNetwork.rds"))
# Show the density of scores
DENSITY_PLT=WUHAN_SCORE_DISTR %>% rbind(MT_SCORE_DIST)%>%
ggdensity(x="Prediction",y="..density..",fill="Dataset")
ggsave(filename = paste0(PEPTIDE_FILEPATH,"/Density.pdf"))
seed=1234
# Create the cluster for neighbor network. As this is substitution based, this is clustered simply on position of mutation
nnet_ISM_cluster <- neighborNetwork_Cluster_Batch(nnet_ISM, DATASET_NEIGHBOUR[,.(Peptide,ImmunogenicityScore)],seed=seed)
# Create the featureDF
nnet_ISM_cluster_featureDF <- neighborNetwork_Cluster_FeatureDF(nnet_ISM_cluster, coreN=5)
nnet_ISM_cluster_featureDF=nnet_ISM_cluster_featureDF %>% mutate(Peptide_Type = ifelse(Peptide == PEPTIDE, "Original","InSilicoMutated"))
saveRDS(nnet_ISM_cluster, file.path(PEPTIDE_FILEPATH, "/NeighborNetwork_Cluster.rds"))
readr::write_csv(nnet_ISM_cluster_featureDF, file.path(PEPTIDE_FILEPATH, "NeighborNetwork_Cluster_FeatureDF.csv"))
gc();gc()
# Create Ogishi's summary DT. We don't use this really as it is messy.
summaryDT <- merge(DATASET_NEIGHBOUR[Peptide %in% PEPTIDE,]%>% mutate(Peptide_Type="Original"), nnet_ISM_cluster_featureDF[Peptide %in% PEPTIDE,], by=c("Peptide","Peptide_Type","ImmunogenicityScore"))
summaryDT[,EscapePotential:=ImmunogenicityScore_Cluster_Diff_Min]
summaryDT <- summaryDT[Peptide_Type=="Original", .(Peptide, ImmunogenicityScore, ImmunogenicityScore_Cluster_Average, EscapePotential)]
data.table::setorder(summaryDT, -ImmunogenicityScore, -ImmunogenicityScore_Cluster_Average, EscapePotential)
readr::write_csv(summaryDT, file.path(PEPTIDE_FILEPATH, "EpitopePrioritizationSummary.csv"))
}
neighborNetwork_ConnectedSubGraph <- function(neighborNetResult, peptide){
connectedPeptides <- c(
peptide,
dplyr::filter(neighborNetResult$"PairDF_DW", AASeq1==peptide)$"AASeq2",
dplyr::filter(neighborNetResult$"PairDF_DW", AASeq2==peptide)$"AASeq1"
)
connectedSubgraph <- igraph::induced_subgraph(
neighborNetResult$"NeighborNetwork_DW",
which(igraph::V(neighborNetResult$"NeighborNetwork_DW")$label %in% connectedPeptides)
)
return(connectedSubgraph)
}
# Read in the summary dataset. This is generated by Ogishi and uses his 'escape potential' metric
# I dont like this metric so i will use avg log ratios for each WT across all mutants
list_of_files = list.files(path="NEIGHBOR_NETWORK_ANALYSIS_TRAPP/",
recursive = T,
pattern = "EpitopePrioritizationSummary.csv",
full.names = T)
EPITOPE_PRIORITIZATION_DT = list_of_files %>% purrr::map_df(fread)
EPITOPE_PRIORITIZATION_DT %>% ggplot(aes(x=reorder(Peptide, -EscapePotential), y=EscapePotential))+geom_bar(stat = "identity",fill="darkgrey")+theme_pubr()+
rotate_x_text()+xlab("Peptide")+ geom_hline(yintercept = 0.1, linetype="dashed",color="white")+ geom_hline(yintercept = 0.2, linetype="dashed",color="white")
# Link information about each simulated mutant with the corresponding WT-MT Prediction scores, given TRAP and/or imputation
MUTATIONDATA = readRDS("INSILICO_MUTAGENESIS_WUHANOMICRON_SUB_MUTATIONDATA.rds")%>% dplyr::rename(AASeq2=VariantAlignment)
FULL_PREDICTIONS_DT=FULL_PREDICTIONS_DT%>% left_join(MUTATIONDATA)
## Joining, by = c("Peptide", "AASeq2")
#saveRDS(FULL_PREDICTIONS_DT, file="INSILICO_MUTAGENESIS_COMPILED_PREDICTIONS_SUBONLY_w_MUTATIONDATA.rds")
# Create logs ratios for each sample.
# Poorly named: Prediction = Omicron overall immunogenicity score.
FULL_PREDICTIONS_DT=FULL_PREDICTIONS_DT %>% mutate(logsOdds = log(Prediction/WuhanScore))
FULL_PREDICTIONS_DT = FULL_PREDICTIONS_DT %>% mutate(Length=nchar(Peptide))
## Gives count, mean, standard deviation, standard error of the mean, and confidence interval (default 95%).
## data: a data frame.
## measurevar: the name of a column that contains the variable to be summariezed
## groupvars: a vector containing names of columns that contain grouping variables
## na.rm: a boolean that indicates whether to ignore NA's
## conf.interval: the percent range of the confidence interval (default is 95%)
summarySE <- function(data=NULL, measurevar, groupvars=NULL, na.rm=FALSE,
conf.interval=.95, .drop=TRUE) {
library(plyr)
# New version of length which can handle NA's: if na.rm==T, don't count them
length2 <- function (x, na.rm=FALSE) {
if (na.rm) sum(!is.na(x))
else length(x)
}
# This does the summary. For each group's data frame, return a vector with
# N, mean, and sd
datac <- ddply(data, groupvars, .drop=.drop,
.fun = function(xx, col) {
c(N = length2(xx[[col]], na.rm=na.rm),
mean = mean (xx[[col]], na.rm=na.rm),
sd = sd (xx[[col]], na.rm=na.rm)
)
},
measurevar
)
# Rename the "mean" column
datac <- rename(datac, c("mean" = measurevar))
datac$se <- datac$sd / sqrt(datac$N) # Calculate standard error of the mean
# Confidence interval multiplier for standard error
# Calculate t-statistic for confidence interval:
# e.g., if conf.interval is .95, use .975 (above/below), and use df=N-1
ciMult <- qt(conf.interval/2 + .5, datac$N-1)
datac$ci <- datac$se * ciMult
return(datac)
}
LENGTH=9
POSITIONS = c(1,2,9)
# What are the mutations which on average cause biggest increases in immunogenicity? Take top 10%
MOST_IMM_MUTATIONS=FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS)%>% # Filter anchor pos
filter(Length == LENGTH)%>% mutate(Mutation=paste0(ChangeFrom,"_", SeqMutationPos,"_",ChangeTo))%>% # filter length. Define mutation
group_by(Mutation)%>% dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% # For each A_X_B mutation, take the mean logs odds.
arrange(desc(meanlogsOdds))%>% filter(! n<20)%>% # Exclude any samples with n<20
slice_max(order_by = meanlogsOdds, prop = 0.10) # Take top 10%
# What are the mutations which on average cause biggest decreases in immunogenicity?Take bottom 10%
LEAST_IMM_MUTATIONS =FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS) %>%
filter(Length == LENGTH)%>% mutate(Mutation=paste0(ChangeFrom,"_", SeqMutationPos,"_",ChangeTo))%>%
group_by(Mutation)%>% dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>%
arrange(desc(meanlogsOdds))%>% filter(! n<20)%>%
slice_min(order_by = meanlogsOdds, prop = 0.10)
# Create vector of this combined set of mutations of interest
MUTS_FOR_ANALYSIS=MOST_IMM_MUTATIONS %>% rbind(LEAST_IMM_MUTATIONS) %>% pull(Mutation)
# Clean and curate the data of top/bottom 10%. Filter for positions of interest and appropriate peptide lengths. Get rid of SeqMutPos==0 if exists which is generated in old cases for earlier stat comparison.
MUTS_FOR_ANALYSIS_DT=FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS) %>% filter(Length == LENGTH)%>%
filter(! SeqMutationPos == 0)%>% mutate(Mutation=paste0(ChangeFrom,"_", SeqMutationPos,"_",ChangeTo))%>%
filter(Mutation %in% MUTS_FOR_ANALYSIS)%>% arrange(desc(logsOdds))
# Create the summary statistics: what is mean logs odds for each mutation and SD/SE.
MUTS_DT=summarySE(MUTS_FOR_ANALYSIS_DT, measurevar="logsOdds", groupvars=c("Mutation"))
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following object is masked from 'package:purrr':
##
## compact
## The following object is masked from 'package:ggpubr':
##
## mutate
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
# Plot
MUTS_DT%>%
ggplot(aes(x=reorder(Mutation, -logsOdds), y=logsOdds))+geom_bar(stat = "identity")+
geom_errorbar(aes(ymin=logsOdds-se, ymax=logsOdds+se), colour="black", width=.5, # Width of the error bars
position=position_dodge(.9))+theme_pubr(base_size = 12)+rotate_x_text()+xlab("Mutation")+ylab("Logs Odds ratio (MT/WT)")
## change from
MOST_IMM_MUTATIONS=FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS)%>% filter(Length == LENGTH)%>%
mutate(ChangeFromPos=paste0(ChangeFrom,"_", SeqMutationPos))%>% group_by(ChangeFromPos)%>%
dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<20)
LEAST_IMM_MUTATIONS =FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS) %>% filter(Length == LENGTH)%>% mutate(ChangeFromPos=paste0(ChangeFrom,"_", SeqMutationPos))%>% group_by(ChangeFromPos)%>%
dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<20)
MUTS_FOR_ANALYSIS=MOST_IMM_MUTATIONS %>% rbind(LEAST_IMM_MUTATIONS) %>% pull(ChangeFromPos)
MUTS_FOR_ANALYSIS_DT=FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS) %>% filter(Length == LENGTH)%>%
filter(! SeqMutationPos == 0)%>% mutate(ChangeFromPos=paste0(ChangeFrom,"_", SeqMutationPos))%>%
group_by(ChangeFromPos)%>% filter(ChangeFromPos %in% MUTS_FOR_ANALYSIS)%>% arrange(desc(logsOdds))
MUTS_DT=summarySE(MUTS_FOR_ANALYSIS_DT, measurevar="logsOdds", groupvars=c("ChangeFromPos"))
MUTS_DT%>%
ggplot(aes(x=reorder(ChangeFromPos, -logsOdds), y=logsOdds))+geom_bar(stat = "identity")+
geom_errorbar(aes(ymin=logsOdds-se, ymax=logsOdds+se), colour="black", width=.5, # Width of the error bars
position=position_dodge(.9))+theme_pubr(base_size = 12)+rotate_x_text()+xlab("Mutation")+ylab("Logs Odds ratio (MT/WT)")
## change to
MOST_IMM_MUTATIONS=FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS)%>% filter(Length == LENGTH)%>%
mutate(ChangeToPos=paste0(SeqMutationPos,"_",ChangeTo))%>% group_by(ChangeToPos)%>%
dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<20)%>%
slice_max(order_by = meanlogsOdds, prop = 0.30)
LEAST_IMM_MUTATIONS =FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS) %>%
filter(Length == LENGTH)%>% mutate(ChangeToPos=paste0(SeqMutationPos,"_",ChangeTo))%>% group_by(ChangeToPos)%>%
dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<20)%>%
slice_min(order_by = meanlogsOdds, prop = 0.3)
MUTS_FOR_ANALYSIS=MOST_IMM_MUTATIONS %>% rbind(LEAST_IMM_MUTATIONS) %>% pull(ChangeToPos)
MUTS_FOR_ANALYSIS_DT=FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS) %>% filter(Length == LENGTH)%>%
filter(! SeqMutationPos == 0)%>% mutate(ChangeToPos=paste0(SeqMutationPos,"_",ChangeTo))%>% group_by(ChangeToPos)%>%
filter(ChangeToPos %in% MUTS_FOR_ANALYSIS)%>% arrange(desc(logsOdds))
MUTS_DT=summarySE(MUTS_FOR_ANALYSIS_DT, measurevar="logsOdds", groupvars=c("ChangeToPos"))
MUTS_DT%>%
ggplot(aes(x=reorder(ChangeToPos, -logsOdds), y=logsOdds))+geom_bar(stat = "identity")+
geom_errorbar(aes(ymin=logsOdds-se, ymax=logsOdds+se), colour="black", width=.5, # Width of the error bars
position=position_dodge(.9))+theme_pubr(base_size = 12)+rotate_x_text()+xlab("Mutation")+ylab("Logs Odds ratio (MT/WT)")
# Color scheme based on chemistry of amino acids
# Nicked from ggseqlogo
chemistry = data.frame(
letter = c('G', 'S', 'T', 'Y', 'C', 'N', 'Q', 'K', 'R', 'H', 'D', 'E', 'P', 'A', 'W', 'F', 'L', 'I', 'M', 'V'),
group = c(rep('Polar', 5), rep('Neutral', 2), rep('Basic', 3), rep('Acidic', 2), rep('Hydrophobic', 8)),
col = c(rep('#109648', 5), rep('#5E239D', 2), rep('#255C99', 3), rep('#D62839', 2), rep('#221E22', 8)),
stringsAsFactors = F
)
LENGTH=9
POSITIONS = c(3,4,5,6,7,8)
# Filter for TCR contact pos in 9mers
# First, only look at positional effects
MUTS_FOR_ANALYSIS_DT=FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS)%>% filter(Length == LENGTH)
# Analyse
MUTS_DT=summarySE(MUTS_FOR_ANALYSIS_DT, measurevar="logsOdds", groupvars=c("SeqMutationPos"))
# Plot logs odds
MUTS_DT%>%
ggplot(aes(x=reorder(SeqMutationPos, -logsOdds), y=logsOdds))+geom_bar(stat = "identity")+
geom_errorbar(aes(ymin=logsOdds-se, ymax=logsOdds+se), colour="black", width=.5, # Width of the error bars
position=position_dodge(.9))+theme_pubr(base_size = 12)+xlab("Sequence Position")+ylab("Logs Odds ratio (MT/WT)")
# color by chenistry
colors = unique(chemistry$col) ; names(colors) = unique(chemistry$group)
#Plot for ChangeFrom amino acid - 9mers.
# Now, look at 'ChangeFrom' amino acid.
MUTS_DT=summarySE(MUTS_FOR_ANALYSIS_DT, measurevar="logsOdds", groupvars=c("ChangeFrom"))
CF_MUTS_DT=MUTS_DT %>% mutate(letter = ChangeFrom) %>% inner_join(chemistry) %>% dplyr::rename(logsOdds_removal=logsOdds)
## Joining, by = "letter"
NINE_CONTACT_FROM_PLT=MUTS_DT %>% mutate(letter = ChangeFrom) %>% inner_join(chemistry)%>%
ggplot(aes(x=reorder(ChangeFrom, -logsOdds), y=logsOdds,fill=group))+geom_bar(stat = "identity")+
geom_errorbar(aes(ymin=logsOdds-se, ymax=logsOdds+se), colour="black", width=.5, # Width of the error bars
position=position_dodge(.9))+theme_pubr(base_size = 12)+xlab("Amino acid (wildtype removal))")+ylab("Logs Odds ratio (MT/WT)")+scale_fill_manual(values=colors)
## Joining, by = "letter"
NINE_CONTACT_FROM_PLT
saveRDS(NINE_CONTACT_FROM_PLT,file="NINE_CONTACT_FROM_PLT.rds")
#Plot for ChangeTo amino acid - 9mers.
MUTS_DT=summarySE(MUTS_FOR_ANALYSIS_DT, measurevar="logsOdds", groupvars=c("ChangeTo"))
CT_MUTS_DT=MUTS_DT %>% mutate(letter = ChangeTo) %>% inner_join(chemistry) %>% dplyr::rename(logsOdds_replacement=logsOdds)
## Joining, by = "letter"
NINE_CONTACT_TO_PLT=MUTS_DT %>% mutate(letter = ChangeTo) %>% inner_join(chemistry)%>%
ggplot(aes(x=reorder(ChangeTo, -logsOdds), y=logsOdds,fill=group))+geom_bar(stat = "identity")+
geom_errorbar(aes(ymin=logsOdds-se, ymax=logsOdds+se), colour="black", width=.5, # Width of the error bars
position=position_dodge(.9))+theme_pubr(base_size = 12)+xlab("Amino acid (mutant replacement)")+ylab("Logs Odds ratio (MT/WT)")+scale_fill_manual(values=colors)
## Joining, by = "letter"
NINE_CONTACT_TO_PLT
saveRDS(NINE_CONTACT_TO_PLT,file="NINE_CONTACT_TO_PLT.rds")
#Supplementary scatter plot
CF_MUTS_DT=CF_MUTS_DT %>% select(letter, group, col, logsOdds_removal)
CT_MUTS_DT=CT_MUTS_DT %>% select(letter, group, col, logsOdds_replacement)
NINE_CONTACT_SCATTER_PLT=CF_MUTS_DT %>% inner_join(CT_MUTS_DT) %>%
ggplot(aes(x=logsOdds_removal, y=logsOdds_replacement, color=group))+geom_point()+
ggrepel::geom_text_repel(aes(label = letter, color = group), size = 3)+theme_pubr(base_size = 16)+scale_color_manual(values=colors)+
ylab("Logs Odds Ratio (Replacement)")+xlab("Logs Odds Ratio (Removal)")+labs(color="chemistry")+guides(color=guide_legend(nrow=2,byrow=TRUE))
## Joining, by = c("letter", "group", "col")
saveRDS(NINE_CONTACT_SCATTER_PLT,file="NINE_CONTACT_SCATTER_PLT.rds")
# Top 5% of 9mer contact position mutations. I.e inducing most immunogenic changes.
MOST_IMM_MUTATIONS=FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS)%>% filter(Length == LENGTH)%>% # Filter 9mer contact
mutate(Mutation=paste0(ChangeFrom,"_", SeqMutationPos,"_",ChangeTo))%>% group_by(Mutation)%>% # Create mutation of form A_X_B
dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<20)%>% # Summarise and !n<20
slice_max(order_by = meanlogsOdds, prop = 0.05) # Top 5%
LEAST_IMM_MUTATIONS =FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS) %>% filter(Length == LENGTH)%>%
mutate(Mutation=paste0(ChangeFrom,"_", SeqMutationPos,"_",ChangeTo))%>% group_by(Mutation)%>%
dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<20)%>%
slice_min(order_by = meanlogsOdds, prop = 0.05) # min 5%
# Bind mutations
MUTS_FOR_ANALYSIS=MOST_IMM_MUTATIONS %>% rbind(LEAST_IMM_MUTATIONS) %>% pull(Mutation)
# Create the dataset based on above mutations and filter settings.
MUTS_FOR_ANALYSIS_DT=FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS) %>% filter(Length == LENGTH)%>%
filter(! SeqMutationPos == 0)%>% mutate(Mutation=paste0(ChangeFrom,"_", SeqMutationPos,"_",ChangeTo))%>%
filter(Mutation %in% MUTS_FOR_ANALYSIS)%>% arrange(desc(logsOdds))
MUTS_DT=summarySE(MUTS_FOR_ANALYSIS_DT, measurevar="logsOdds", groupvars=c("Mutation"))
# Create plot: NINEMER CONTACT, FULL MUTATION.
NINE_CONTACT_TO_SPECIFICMUT_PLT=MUTS_DT%>% mutate(ChangeFrom = stringr::str_extract(Mutation,"[A-Z]\\_"))%>%
mutate(ChangeFrom = gsub("\\_", "", ChangeFrom))%>% mutate(letter = ChangeFrom) %>%
inner_join(chemistry)%>%
ggplot(aes(x=reorder(Mutation, -logsOdds), y=logsOdds,fill=group))+geom_bar(stat = "identity")+
geom_errorbar(aes(ymin=logsOdds-se, ymax=logsOdds+se), colour="black", width=.5, # Width of the error bars
position=position_dodge(.9))+theme_pubr(base_size = 12)+rotate_x_text()+xlab("Mutation")+ylab("Logs Odds ratio (MT/WT)")+scale_fill_manual(values = colors)
## Joining, by = "letter"
NINE_CONTACT_TO_SPECIFICMUT_PLT
saveRDS(NINE_CONTACT_TO_SPECIFICMUT_PLT,file="NINE_CONTACT_TO_SPECIFICMUT_PLT.rds")
MOST_IMM_MUTATIONS=FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS)%>% filter(Length == LENGTH)%>%
mutate(ChangeFromPos=paste0(ChangeFrom,"_", SeqMutationPos))%>% group_by(ChangeFromPos)%>%
dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<20)%>%
slice_max(order_by = meanlogsOdds, prop = 0.3) # Take top 30% of 'ChangeFrom' amino acids for 9mers in contact position
LEAST_IMM_MUTATIONS =FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS) %>% filter(Length == LENGTH)%>%
mutate(ChangeFromPos=paste0(ChangeFrom,"_", SeqMutationPos))%>% group_by(ChangeFromPos)%>%
dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<20)%>%
slice_min(order_by = meanlogsOdds, prop = 0.3) # Take bottom 30% of 'ChangeFrom' amino acid changes for 9mers in contact pos.
MUTS_FOR_ANALYSIS=MOST_IMM_MUTATIONS %>% rbind(LEAST_IMM_MUTATIONS) %>% pull(ChangeFromPos)
# Create DT.
MUTS_FOR_ANALYSIS_DT=FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS) %>% filter(Length == LENGTH)%>%
filter(! SeqMutationPos == 0)%>% mutate(ChangeFromPos=paste0(ChangeFrom,"_", SeqMutationPos))%>% group_by(ChangeFromPos)%>%
filter(ChangeFromPos %in% MUTS_FOR_ANALYSIS)%>% arrange(desc(logsOdds))
MUTS_DT=summarySE(MUTS_FOR_ANALYSIS_DT, measurevar="logsOdds", groupvars=c("ChangeFromPos"))
# Plot.
MUTS_DT%>% mutate(letter = stringr::str_extract(ChangeFromPos,"[A-Z]"))%>% inner_join(chemistry)%>%
ggplot(aes(x=reorder(ChangeFromPos, -logsOdds), y=logsOdds, fill=group))+geom_bar(stat = "identity")+
geom_errorbar(aes(ymin=logsOdds-se, ymax=logsOdds+se), colour="black", width=.5, # Width of the error bars
position=position_dodge(.9))+theme_pubr(base_size = 12)+rotate_x_text()+xlab("Mutation")+ylab("Logs Odds ratio (MT/WT)")+scale_fill_manual(values=colors)
MOST_IMM_MUTATIONS=FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS)%>% filter(Length == LENGTH)%>%
mutate(ChangeToPos=paste0(SeqMutationPos,"_",ChangeTo))%>% group_by(ChangeToPos)%>%
dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<20)%>%
slice_max(order_by = meanlogsOdds, prop = 0.3) # Take top 30%.
LEAST_IMM_MUTATIONS =FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS) %>% filter(Length == LENGTH)%>%
mutate(ChangeToPos=paste0(SeqMutationPos,"_",ChangeTo))%>% group_by(ChangeToPos)%>%
dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<20)%>%
slice_min(order_by = meanlogsOdds, prop = 0.3)
MUTS_FOR_ANALYSIS=MOST_IMM_MUTATIONS %>% rbind(LEAST_IMM_MUTATIONS) %>% pull(ChangeToPos)
MUTS_FOR_ANALYSIS_DT=FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS) %>% filter(Length == LENGTH)%>%
filter(! SeqMutationPos == 0)%>% mutate(ChangeToPos=paste0(SeqMutationPos,"_",ChangeTo))%>% group_by(ChangeToPos)%>%
filter(ChangeToPos %in% MUTS_FOR_ANALYSIS)%>% arrange(desc(logsOdds))
MUTS_DT=summarySE(MUTS_FOR_ANALYSIS_DT, measurevar="logsOdds", groupvars=c("ChangeToPos"))
MUTS_DT%>% mutate(letter = stringr::str_extract(ChangeToPos,"[A-Z]"))%>% inner_join(chemistry)%>%
ggplot(aes(x=reorder(ChangeToPos, -logsOdds), y=logsOdds, fill=group))+geom_bar(stat = "identity")+
geom_errorbar(aes(ymin=logsOdds-se, ymax=logsOdds+se), colour="black", width=.5, # Width of the error bars
position=position_dodge(.9))+theme_pubr(base_size = 12)+rotate_x_text()+xlab("Mutation")+ylab("Logs Odds ratio (MT/WT)")+scale_fill_manual(values=colors)
LENGTH=10
POSITIONS = c(1,2,10)
MOST_IMM_MUTATIONS=FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS)%>% filter(Length == LENGTH)%>%
mutate(Mutation=paste0(ChangeFrom,"_", SeqMutationPos,"_",ChangeTo))%>% group_by(Mutation)%>%
dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<20)%>%
slice_max(order_by = meanlogsOdds, prop = 0.05)
LEAST_IMM_MUTATIONS =FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS) %>% filter(Length == LENGTH)%>%
mutate(Mutation=paste0(ChangeFrom,"_", SeqMutationPos,"_",ChangeTo))%>% group_by(Mutation)%>%
dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<20)%>%
slice_min(order_by = meanlogsOdds, prop = 0.05)
MUTS_FOR_ANALYSIS=MOST_IMM_MUTATIONS %>% rbind(LEAST_IMM_MUTATIONS) %>% pull(Mutation)
MUTS_FOR_ANALYSIS_DT=FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS) %>%
filter(Length == LENGTH)%>% filter(! SeqMutationPos == 0)%>% mutate(Mutation=paste0(ChangeFrom,"_", SeqMutationPos,"_",ChangeTo))%>%
filter(Mutation %in% MUTS_FOR_ANALYSIS)%>% arrange(desc(logsOdds))
MUTS_DT=summarySE(MUTS_FOR_ANALYSIS_DT, measurevar="logsOdds", groupvars=c("Mutation"))
MUTS_DT%>%
ggplot(aes(x=reorder(Mutation, -logsOdds), y=logsOdds))+geom_bar(stat = "identity")+
geom_errorbar(aes(ymin=logsOdds-se, ymax=logsOdds+se), colour="black", width=.5, # Width of the error bars
position=position_dodge(.9))+theme_pubr(base_size = 12)+rotate_x_text()+xlab("Mutation")+ylab("Logs Odds ratio (MT/WT)")
MOST_IMM_MUTATIONS=FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS)%>% filter(Length == LENGTH)%>%
mutate(ChangeFromPos=paste0(ChangeFrom,"_", SeqMutationPos))%>% group_by(ChangeFromPos)%>%
dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<20)%>%
slice_max(order_by = meanlogsOdds, prop = 0.3)
LEAST_IMM_MUTATIONS =FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS) %>% filter(Length == LENGTH)%>%
mutate(ChangeFromPos=paste0(ChangeFrom,"_", SeqMutationPos))%>% group_by(ChangeFromPos)%>%
dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<20)%>%
slice_min(order_by = meanlogsOdds, prop = 0.3)
MUTS_FOR_ANALYSIS=MOST_IMM_MUTATIONS %>% rbind(LEAST_IMM_MUTATIONS) %>% pull(ChangeFromPos)
MUTS_FOR_ANALYSIS_DT=FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS) %>% filter(Length == LENGTH)%>%
filter(! SeqMutationPos == 0)%>% mutate(ChangeFromPos=paste0(ChangeFrom,"_", SeqMutationPos))%>% group_by(ChangeFromPos)%>%
filter(ChangeFromPos %in% MUTS_FOR_ANALYSIS)%>% arrange(desc(logsOdds))
MUTS_DT=summarySE(MUTS_FOR_ANALYSIS_DT, measurevar="logsOdds", groupvars=c("ChangeFromPos"))
MUTS_DT%>%
ggplot(aes(x=reorder(ChangeFromPos, -logsOdds), y=logsOdds))+geom_bar(stat = "identity")+
geom_errorbar(aes(ymin=logsOdds-se, ymax=logsOdds+se), colour="black", width=.5, # Width of the error bars
position=position_dodge(.9))+theme_pubr(base_size = 12)+rotate_x_text()+xlab("Mutation")+ylab("Logs Odds ratio (MT/WT)")
MOST_IMM_MUTATIONS=FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS)%>% filter(Length == LENGTH)%>%
mutate(ChangeToPos=paste0(SeqMutationPos,"_",ChangeTo))%>% group_by(ChangeToPos)%>%
dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<20)%>%
slice_max(order_by = meanlogsOdds, prop = 0.3)
LEAST_IMM_MUTATIONS =FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS) %>% filter(Length == LENGTH)%>%
mutate(ChangeToPos=paste0(SeqMutationPos,"_",ChangeTo))%>% group_by(ChangeToPos)%>%
dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<20)%>%
slice_min(order_by = meanlogsOdds, prop = 0.3)
MUTS_FOR_ANALYSIS=MOST_IMM_MUTATIONS %>% rbind(LEAST_IMM_MUTATIONS) %>% pull(ChangeToPos)
MUTS_FOR_ANALYSIS_DT=FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS) %>% filter(Length == LENGTH)%>%
filter(! SeqMutationPos == 0)%>% mutate(ChangeToPos=paste0(SeqMutationPos,"_",ChangeTo))%>% group_by(ChangeToPos)%>%
filter(ChangeToPos %in% MUTS_FOR_ANALYSIS)%>% arrange(desc(logsOdds))
MUTS_DT=summarySE(MUTS_FOR_ANALYSIS_DT, measurevar="logsOdds", groupvars=c("ChangeToPos"))
MUTS_DT%>%
ggplot(aes(x=reorder(ChangeToPos, -logsOdds), y=logsOdds))+geom_bar(stat = "identity")+
geom_errorbar(aes(ymin=logsOdds-se, ymax=logsOdds+se), colour="black", width=.5, # Width of the error bars
position=position_dodge(.9))+theme_pubr(base_size = 12)+rotate_x_text()+xlab("Mutation")+ylab("Logs Odds ratio (MT/WT)")
LENGTH=10
POSITIONS = c(3,4,5,6,7,8,9)
# Analyse positional impact
MUTS_FOR_ANALYSIS_DT=FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS)%>% filter(Length == LENGTH)
MUTS_DT=summarySE(MUTS_FOR_ANALYSIS_DT, measurevar="logsOdds", groupvars=c("SeqMutationPos"))
MUTS_DT%>%
ggplot(aes(x=reorder(SeqMutationPos, -logsOdds), y=logsOdds))+geom_bar(stat = "identity")+
geom_errorbar(aes(ymin=logsOdds-se, ymax=logsOdds+se), colour="black", width=.5, # Width of the error bars
position=position_dodge(.9))+theme_pubr(base_size = 12)+xlab("Sequence Position")+ylab("Logs Odds ratio (MT/WT)")
# Analyse the effect simply of the 'ChangeFrom' amino acid
MUTS_DT=summarySE(MUTS_FOR_ANALYSIS_DT, measurevar="logsOdds", groupvars=c("ChangeFrom"))
CF_MUTS_DT=MUTS_DT %>% mutate(letter = ChangeFrom) %>% inner_join(chemistry) %>% dplyr::rename(logsOdds_removal=logsOdds)
## Joining, by = "letter"
TEN_CONTACT_FROM_PLT=MUTS_DT %>% mutate(letter = ChangeFrom) %>% inner_join(chemistry)%>%
ggplot(aes(x=reorder(ChangeFrom, -logsOdds), y=logsOdds,fill=group))+geom_bar(stat = "identity")+
geom_errorbar(aes(ymin=logsOdds-se, ymax=logsOdds+se), colour="black", width=.5, # Width of the error bars
position=position_dodge(.9))+theme_pubr(base_size = 12)+xlab("Amino acid (wildtype removal)")+ylab("Logs Odds ratio (MT/WT)")+scale_fill_manual(values=colors)
## Joining, by = "letter"
TEN_CONTACT_FROM_PLT
saveRDS(TEN_CONTACT_FROM_PLT,file="TEN_CONTACT_FROM_PLT.rds")
# Analyse the effect of the 'ChangeTo' amino acid
MUTS_DT=summarySE(MUTS_FOR_ANALYSIS_DT, measurevar="logsOdds", groupvars=c("ChangeTo"))
CT_MUTS_DT=MUTS_DT %>% mutate(letter = ChangeTo) %>% inner_join(chemistry) %>% dplyr::rename(logsOdds_replacement=logsOdds)
## Joining, by = "letter"
TEN_CONTACT_TO_PLT=MUTS_DT %>% mutate(letter = ChangeTo) %>% inner_join(chemistry)%>%
ggplot(aes(x=reorder(ChangeTo, -logsOdds), y=logsOdds,fill=group))+geom_bar(stat = "identity")+
geom_errorbar(aes(ymin=logsOdds-se, ymax=logsOdds+se), colour="black", width=.5, # Width of the error bars
position=position_dodge(.9))+theme_pubr(base_size = 12)+xlab("Amino acid (mutant replacement)")+ylab("Logs Odds ratio (MT/WT)")+scale_fill_manual(values=colors)
## Joining, by = "letter"
TEN_CONTACT_TO_PLT
saveRDS(TEN_CONTACT_TO_PLT,file="TEN_CONTACT_TO_PLT.rds")
#Supplementary scatter plot
CF_MUTS_DT=CF_MUTS_DT %>% select(letter, group, col, logsOdds_removal)
CT_MUTS_DT=CT_MUTS_DT %>% select(letter, group, col, logsOdds_replacement)
TEN_CONTACT_SCATTER_PLT=CF_MUTS_DT %>% inner_join(CT_MUTS_DT) %>%
ggplot(aes(x=logsOdds_removal, y=logsOdds_replacement, color=group))+geom_point()+
ggrepel::geom_text_repel(aes(label = letter, color = group), size = 3)+theme_pubr(base_size = 16)+scale_color_manual(values=colors)+
ylab("Logs Odds Ratio (Replacement)")+xlab("Logs Odds Ratio (Removal)")+labs(color="chemistry")+guides(color=guide_legend(nrow=2,byrow=TRUE))
## Joining, by = c("letter", "group", "col")
TEN_CONTACT_SCATTER_PLT
saveRDS(TEN_CONTACT_SCATTER_PLT,file="TEN_CONTACT_SCATTER_PLT.rds")
-A_X_B mutation form across TCR contact positionsof 10mers.
# Top 10% of mutations as described previously.
MOST_IMM_MUTATIONS=FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS)%>% filter(Length == LENGTH)%>%
mutate(Mutation=paste0(ChangeFrom,"_", SeqMutationPos,"_",ChangeTo))%>% group_by(Mutation)%>%
dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<20)%>%
slice_max(order_by = meanlogsOdds, prop = 0.10)
# Bottom 10%
LEAST_IMM_MUTATIONS =FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS) %>% filter(Length == LENGTH)%>%
mutate(Mutation=paste0(ChangeFrom,"_", SeqMutationPos,"_",ChangeTo))%>% group_by(Mutation)%>%
dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<20)%>%
slice_min(order_by = meanlogsOdds, prop = 0.10)
MUTS_FOR_ANALYSIS=MOST_IMM_MUTATIONS %>% rbind(LEAST_IMM_MUTATIONS) %>% pull(Mutation)
MUTS_FOR_ANALYSIS_DT=FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS) %>% filter(Length == LENGTH)%>%
filter(! SeqMutationPos == 0)%>% mutate(Mutation=paste0(ChangeFrom,"_", SeqMutationPos,"_",ChangeTo))%>%
filter(Mutation %in% MUTS_FOR_ANALYSIS)%>% arrange(desc(logsOdds))
MUTS_DT=summarySE(MUTS_FOR_ANALYSIS_DT, measurevar="logsOdds", groupvars=c("Mutation"))
MUTS_DT%>% mutate(ChangeFrom = stringr::str_extract(Mutation,"[A-Z]\\_"))%>%
mutate(ChangeFrom = gsub("\\_", "", ChangeFrom))%>% mutate(letter = ChangeFrom) %>% inner_join(chemistry)%>%
ggplot(aes(x=reorder(Mutation, -logsOdds), y=logsOdds,fill=group))+geom_bar(stat = "identity")+
geom_errorbar(aes(ymin=logsOdds-se, ymax=logsOdds+se), colour="black", width=.5, # Width of the error bars
position=position_dodge(.9))+theme_pubr(base_size = 12)+rotate_x_text()+xlab("Mutation")+ylab("Logs Odds ratio (MT/WT)")+scale_fill_manual(values = colors)
## Joining, by = "letter"
# Top 30%
MOST_IMM_MUTATIONS=FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS)%>% filter(Length == LENGTH)%>%
mutate(ChangeFromPos=paste0(ChangeFrom,"_", SeqMutationPos))%>% group_by(ChangeFromPos)%>%
dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<20)%>%
slice_max(order_by = meanlogsOdds, prop = 0.3)
# Bottom 30%
LEAST_IMM_MUTATIONS =FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS) %>% filter(Length == LENGTH)%>%
mutate(ChangeFromPos=paste0(ChangeFrom,"_", SeqMutationPos))%>% group_by(ChangeFromPos)%>%
dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<20)%>%
slice_min(order_by = meanlogsOdds, prop = 0.3)
MUTS_FOR_ANALYSIS=MOST_IMM_MUTATIONS %>% rbind(LEAST_IMM_MUTATIONS) %>% pull(ChangeFromPos)
MUTS_FOR_ANALYSIS_DT=FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS) %>% filter(Length == LENGTH)%>%
filter(! SeqMutationPos == 0)%>% mutate(ChangeFromPos=paste0(ChangeFrom,"_", SeqMutationPos))%>% group_by(ChangeFromPos)%>%
filter(ChangeFromPos %in% MUTS_FOR_ANALYSIS)%>% arrange(desc(logsOdds))
MUTS_DT=summarySE(MUTS_FOR_ANALYSIS_DT, measurevar="logsOdds", groupvars=c("ChangeFromPos"))
MUTS_DT%>% mutate(letter = stringr::str_extract(ChangeFromPos,"[A-Z]"))%>% inner_join(chemistry)%>%
ggplot(aes(x=reorder(ChangeFromPos, -logsOdds), y=logsOdds, fill=group))+geom_bar(stat = "identity")+
geom_errorbar(aes(ymin=logsOdds-se, ymax=logsOdds+se), colour="black", width=.5, # Width of the error bars
position=position_dodge(.9))+theme_pubr(base_size = 12)+rotate_x_text()+xlab("Mutation")+ylab("Logs Odds ratio (MT/WT)")+scale_fill_manual(values=colors)
## Joining, by = "letter"
MOST_IMM_MUTATIONS=FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS)%>% filter(Length == LENGTH)%>%
mutate(ChangeToPos=paste0(SeqMutationPos,"_",ChangeTo))%>% group_by(ChangeToPos)%>%
dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<20)%>%
slice_max(order_by = meanlogsOdds, prop = 0.3)
LEAST_IMM_MUTATIONS =FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS) %>% filter(Length == LENGTH)%>%
mutate(ChangeToPos=paste0(SeqMutationPos,"_",ChangeTo))%>% group_by(ChangeToPos)%>%
dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<20)%>%
slice_min(order_by = meanlogsOdds, prop = 0.3)
MUTS_FOR_ANALYSIS=MOST_IMM_MUTATIONS %>% rbind(LEAST_IMM_MUTATIONS) %>% pull(ChangeToPos)
MUTS_FOR_ANALYSIS_DT=FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS) %>% filter(Length == LENGTH)%>%
filter(! SeqMutationPos == 0)%>% mutate(ChangeToPos=paste0(SeqMutationPos,"_",ChangeTo))%>% group_by(ChangeToPos)%>%
filter(ChangeToPos %in% MUTS_FOR_ANALYSIS)%>% arrange(desc(logsOdds))
MUTS_DT=summarySE(MUTS_FOR_ANALYSIS_DT, measurevar="logsOdds", groupvars=c("ChangeToPos"))
TEN_CONTACT_TO_AA_POS_PLT=MUTS_DT%>% mutate(letter = stringr::str_extract(ChangeToPos,"[A-Z]"))%>%
inner_join(chemistry)%>%
ggplot(aes(x=reorder(ChangeToPos, -logsOdds), y=logsOdds, fill=group))+geom_bar(stat = "identity")+
geom_errorbar(aes(ymin=logsOdds-se, ymax=logsOdds+se), colour="black", width=.5, # Width of the error bars
position=position_dodge(.9))+theme_pubr(base_size = 12)+rotate_x_text()+xlab("Mutation")+ylab("Logs Odds ratio (MT/WT)")+scale_fill_manual(values=colors)
## Joining, by = "letter"
TEN_CONTACT_TO_AA_POS_PLT
saveRDS(TEN_CONTACT_TO_AA_POS_PLT,file="TEN_CONTACT_TO_AA_POS_PLT.rds")
PROP=0.30
MOST_IMM_MUTATIONS=FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS)%>% filter(Length == LENGTH)%>% mutate(ChangeToPos=paste0(SeqMutationPos,"_",ChangeTo))%>% group_by(ChangeToPos)%>%
dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<20)%>%
slice_max(order_by = meanlogsOdds, prop = PROP)
LEAST_IMM_MUTATIONS =FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS) %>% filter(Length == LENGTH)%>% mutate(ChangeToPos=paste0(SeqMutationPos,"_",ChangeTo))%>% group_by(ChangeToPos)%>%
dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<20)%>%
slice_min(order_by = meanlogsOdds, prop = PROP)
MUTS_FOR_ANALYSIS=MOST_IMM_MUTATIONS %>% rbind(LEAST_IMM_MUTATIONS) %>% pull(ChangeToPos)
MUTS_FOR_ANALYSIS_DT=FULL_PREDICTIONS_DT %>% filter(SeqMutationPos %in% POSITIONS) %>% filter(Length == LENGTH)%>% filter(! SeqMutationPos == 0)%>% mutate(ChangeToPos=paste0(SeqMutationPos,"_",ChangeTo))%>% group_by(ChangeToPos)%>%
filter(ChangeToPos %in% MUTS_FOR_ANALYSIS)%>% arrange(desc(logsOdds))
MUTS_DT=summarySE(MUTS_FOR_ANALYSIS_DT, measurevar="logsOdds", groupvars=c("ChangeToPos"))
DT_FOR_BIAS_ANALYSIS=MUTS_DT%>% mutate(letter = stringr::str_extract(ChangeToPos,"[A-Z]"))%>% inner_join(chemistry)%>%arrange(desc(logsOdds))%>%
ungroup() %>% dplyr::mutate(ID=dplyr::row_number())%>% mutate(Position=readr::parse_number(ChangeToPos))
DT_FOR_BIAS_ANALYSIS_OUT = foreach::foreach(i=1:(nrow(DT_FOR_BIAS_ANALYSIS)-4), .combine = "rbind")%do% {
DT_FOR_BIAS_ANALYSIS %>% filter(ID %in% seq(i,(i+4)))%>% group_by(Position) %>% dplyr::summarise(n=n())%>% mutate(ID=i)
}
DT_FOR_BIAS_ANALYSIS_OUT %>% ungroup() %>% tidyr::complete(Position, ID, fill=list(n=0))%>% ungroup()%>%mutate(Position=as.character(Position))%>%
group_by(ID)%>% mutate(Freq=n/sum(n))%>%
ggline(x="ID",y="n",color="Position")
DT_FOR_BIAS_ANALYSIS_OUT %>% ungroup() %>% ungroup()%>%mutate(Position=as.character(Position))%>%
group_by(ID)%>% mutate(Freq=n/sum(n))%>%
ggline(x="ID",y="n",color="Position")
POSITION=3
DT_FOR_BIAS_ANALYSIS_OUT %>% ungroup() %>% tidyr::complete(Position, ID, fill=list(n=0))%>% ungroup()%>%mutate(Position=as.character(Position))%>%
filter(Position==POSITION)%>%
group_by(ID)%>% mutate(Freq=n/sum(n))%>%
ggline(x="ID",y="n",color="Position")#+ylim(0,5)
POSITION=4
DT_FOR_BIAS_ANALYSIS_OUT %>% ungroup() %>% tidyr::complete(Position, ID, fill=list(n=0))%>% ungroup()%>%mutate(Position=as.character(Position))%>%
filter(Position==POSITION)%>%
group_by(ID)%>% mutate(Freq=n/sum(n))%>%
ggline(x="ID",y="n",color="Position")#+ylim(0,5)
POSITION=5
DT_FOR_BIAS_ANALYSIS_OUT %>% ungroup() %>% tidyr::complete(Position, ID, fill=list(n=0))%>% ungroup()%>%mutate(Position=as.character(Position))%>%
filter(Position==POSITION)%>%
group_by(ID)%>% mutate(Freq=n/sum(n))%>%
ggline(x="ID",y="n",color="Position")#+ylim(0,5)
POSITION=6
DT_FOR_BIAS_ANALYSIS_OUT %>% ungroup() %>% tidyr::complete(Position, ID, fill=list(n=0))%>% ungroup()%>%mutate(Position=as.character(Position))%>%
filter(Position==POSITION)%>%
group_by(ID)%>% mutate(Freq=n/sum(n))%>%
ggline(x="ID",y="n",color="Position")#+ylim(0,5)
POSITION=7
DT_FOR_BIAS_ANALYSIS_OUT %>% ungroup() %>% tidyr::complete(Position, ID, fill=list(n=0))%>% ungroup()%>%mutate(Position=as.character(Position))%>%
filter(Position==POSITION)%>%
group_by(ID)%>% mutate(Freq=n/sum(n))%>%
ggline(x="ID",y="n",color="Position")#+ylim(0,5)
POSITION=8
DT_FOR_BIAS_ANALYSIS_OUT %>% ungroup() %>% tidyr::complete(Position, ID, fill=list(n=0))%>% ungroup()%>%mutate(Position=as.character(Position))%>%
filter(Position==POSITION)%>%
group_by(ID)%>% mutate(Freq=n/sum(n))%>%
ggline(x="ID",y="n",color="Position")#+ylim(0,5)
POSITION=9
DT_FOR_BIAS_ANALYSIS_OUT %>% ungroup() %>% tidyr::complete(Position, ID, fill=list(n=0))%>% ungroup()%>%mutate(Position=as.character(Position))%>%
filter(Position==POSITION)%>%
group_by(ID)%>% mutate(Freq=n/sum(n))%>%
ggline(x="ID",y="n",color="Position")#+ylim(0,5)
LENGTH=9
MOST_IMM_MUTATIONS=FULL_PREDICTIONS_DT %>% filter(Length == LENGTH)%>% filter(! SeqMutationPos == 0)%>% mutate(Mutation=paste0(ChangeFrom,"_", SeqMutationPos,"_",ChangeTo))%>% group_by(Mutation)%>%
dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<50)%>%
slice_max(order_by = meanlogsOdds, prop = 0.1)
LEAST_IMM_MUTATIONS =FULL_PREDICTIONS_DT %>% filter(Length == LENGTH)%>% filter(! SeqMutationPos == 0)%>% mutate(Mutation=paste0(ChangeFrom,"_", SeqMutationPos,"_",ChangeTo))%>% group_by(Mutation)%>%
dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<50)%>%
slice_min(order_by = meanlogsOdds, prop = 0.1)
MUTS_FOR_ANALYSIS=MOST_IMM_MUTATIONS %>% rbind(LEAST_IMM_MUTATIONS) %>% pull(Mutation)
FULL_PREDICTIONS_DT %>% filter(Length == LENGTH)%>% filter(! SeqMutationPos == 0)%>% mutate(Mutation=paste0(ChangeFrom,"_", SeqMutationPos,"_",ChangeTo))%>%
filter(Mutation %in% MUTS_FOR_ANALYSIS)%>% arrange(desc(logsOdds))%>%
mutate(OmicronAASEQ2 = ifelse(AASeq2 %in% MUTATIONS$VariantAlignment, TRUE, FALSE))%>%
ggplot(aes(x=reorder(Mutation,-logsOdds),y=logsOdds,color=OmicronAASEQ2))+geom_point(aes(alpha=OmicronAASEQ2))+theme_pubr()+rotate_x_text()+geom_hline(yintercept = 1.0,linetype="dashed")+
scale_alpha_manual(values=c(0.2,1))+
scale_x_discrete(guide = guide_axis(check.overlap = TRUE))+xlab("Peptide")+ylab("Logs Odds (Mutant/Wild-type)")
FULL_PREDICTIONS_DT %>% filter(Length == LENGTH)%>% filter(! SeqMutationPos == 0)%>% mutate(Mutation=paste0(ChangeFrom,"_", SeqMutationPos,"_",ChangeTo))%>%
filter(Mutation %in% MUTS_FOR_ANALYSIS)%>% arrange(desc(logsOdds))%>%
mutate(OmicronAASEQ2 = ifelse(AASeq2 %in% MUTATIONS$VariantAlignment, TRUE, FALSE)) %>% filter(OmicronAASEQ2==TRUE)%>% group_by(Mutation, Peptide) %>% dplyr::summarise(meanLogs = mean(logsOdds))
```{,dpi=300, fig.width = 10, fig.height = 10} seed=1234 FILEPATH = “NEIGHBOR_NETWORK_ANALYSIS_TRAPP/” for(i in 1:length(unique(FULL_PREDICTIONS_DT\(Peptide))){ # Grab required data PEPTIDE = unique(FULL_PREDICTIONS_DT\)Peptide)[i] DATASET = FULL_PREDICTIONS_DT %>% filter(Peptide%in% PEPTIDE) DATASET=DATASET%>% group_by(Peptide, AASeq2) %>% dplyr::summarise(ImmunogenicityScore=mean(Prediction))
DATASET_NEIGHBOUR = DATASET %>% ungroup%>% select(!Peptide)%>% dplyr::rename(Peptide=AASeq2) DATASET_NEIGHBOUR=DATASET_NEIGHBOUR %>% as.data.table SCORES=DATASET_NEIGHBOUR %>% mutate(Peptide_Type = ifelse(Peptide == PEPTIDE, “Original”,“InSilicoMutated”)) # Read in NNetwork NEIGHBORNETWORK = readRDS(paste0(FILEPATH,“/”,PEPTIDE,“/NeighborNetwork.rds”)) # Read in metadata metadataDF = fread(paste0(FILEPATH,“/”,PEPTIDE,“/NeighborNetwork_Cluster_FeatureDF.csv”)) peptide=PEPTIDE # Below is ogishi code to create visualisation graph=neighborNetwork_ConnectedSubGraph(NEIGHBORNETWORK, peptide) ## Peptide labels peptideLabels <- peptide peptideSet <- igraph::V(graph)\("name" igraph::V(graph)\)label[!peptideSet %in% peptideLabels] <- “”
## Metadata metadataDF <- dplyr::filter(metadataDF, Peptide %in% peptideSet) df_meta <- dplyr::select(metadataDF, Peptide, ImmunogenicityScore) if(“Immunogenicity” %in% colnames(metadataDF)){ df_meta\(Immunogenicity <- metadataDF\)Immunogenicity igraph::V(graph)\(Immunogenicity <- df_meta\)Immunogenicity } igraph::V(graph)\(ImmunogenicityScore <- df_meta\)ImmunogenicityScore
set.seed(seed) df_meta <- dplyr::left_join( dplyr::tibble(“Peptide”=peptideSet, “Target”=dplyr::if_else(peptideSet==peptide, “Target”, “Neighbor”), “ClusterID”=paste0(“Cluster”, igraph::cluster_walktrap(graph)$membership)), df_meta, by=“Peptide” ) ## Coordinates set.seed(seed) l <- igraph::layout_nicely(graph) df_meta <- dplyr::left_join( magrittr::set_colnames(cbind(dplyr::tibble(“Peptide”=peptideSet), as.data.frame(l)), c(“Peptide”,“x”,“y”)), df_meta, by=“Peptide” )
## Consensus per cluster (these are output by Repitope!). We have more clusters for this peptide than reported in Ogishi et al. SAme insight however so maybe he reduced it for complexity clusteredPeptides <- df_meta %>% dplyr::arrange(ClusterID) %>% dplyr::mutate(ClusterID=as.character(ClusterID)) %>% data.table::as.data.table() %>% split(by=“ClusterID”) %>% lapply(function(d){d[[“Peptide”]]}) consensusSequence <- function(sequenceSet){ if(length(sequenceSet)==1) return(sequenceSet) sink(tempfile()) s <- msa::msaConsensusSequence(msa::msaClustalW(sequenceSet, type=“protein”), type=“Biostrings”, ambiguityMap=“X”, threshold=0.5) sink() return(s) } clusterConsensusSeqs <- sapply(clusteredPeptides, consensusSequence) clusterConsensusSeqs <- stringr::str_replace_all(clusterConsensusSeqs, stringr::fixed(“-”), “X”)
## Graph plot ### Vertex annotations colPal <- function(x){ pal <- colorRamp(append(ggsci::pal_d3()(2), “white”, after=1), space=“rgb”) cols <- pal(x) apply(cols, 1, function(x){rgb(x[1], x[2], x[3], maxColorValue=255)}) } layout=l g=graph
clusterColors <- df_meta %>% dplyr::group_by(ClusterID) %>% dplyr::summarise(ImmunogenicityColor=colPal(mean(ImmunogenicityScore))) clusterColors=clusterColors %>% mutate(ID=readr::parse_number(ClusterID))%>% arrange((ID)) %>% select(!ID)
clusterColors <- scales::alpha(clusterColors$"ImmunogenicityColor", alpha=0.75)
vertexColors <- colPal(df_meta$"ImmunogenicityScore")
if(is.null(df_meta$"Immunogenicity")){
vertexShapes <- "circle"
}else{
vertexShapes <- dplyr::if_else(df_meta$"Immunogenicity"=="Positive", "circle", "square")
}
vertexLabels <- igraph::V(g)$"label"
### Cluster labels clusterCentroids <- df_meta %>% dplyr::group_by(ClusterID) %>% dplyr::summarise(x=mean(x), y=mean(y)) clusterCentroids\(x <- scales::rescale(clusterCentroids\)x, to=c(-1, 1)) clusterCentroids\(y <- scales::rescale(clusterCentroids\)y, to=c(-1, 1))
set.seed(seed)
pdf(paste0(FILEPATH,"/",PEPTIDE,"/Plot.pdf"),width = 10,height=10)
plot(
igraph::cluster_walktrap(g), g,
layout=layout,
col=vertexColors,
mark.border="black",
mark.col=clusterColors,
vertex.size=3,
vertex.shape=vertexShapes,
vertex.label=vertexLabels,
vertex.label.color="black",
vertex.label.cex=1.25,
vertex.label.dist=0.5,
vertex.label.family="sans",
vertex.color=vertexColors,
vertex.frame.color="black",
edge.width=0.5,
edge.arrow.size=0.25,
edge.arrow.width=0.1,
edge.color=scales::alpha("gray50", 0.75)
)
for(i in 1:length(clusterCentroids$ClusterID)){
CLUSTER_IDS=data.table(CLUSTERID=paste0("Cluster",stringr::str_locate(clusterConsensusSeqs[[i]], "X")[1]))
CLUSTER_IDS=CLUSTER_IDS %>% mutate(CLUSTERID = replace(CLUSTERID, grepl("NA",CLUSTERID), ""))
text(
rep(clusterCentroids$x[[i]], 2),
clusterCentroids$y[[i]]+c(0.02, -0.1),
c(CLUSTER_IDS$CLUSTERID, clusterConsensusSeqs[[i]]),
pos=3, cex=1.25, family="sans"
)
}
dev.off()
gc();gc()
}
# What are most escape prone epitopes?
## V1
- Take mean of pMHC for each WT-MT combo.
- Then calculate log ratio and visualise mean+/se
- This version was replaced as we're using the mean of a set of sample means here. Not accurate as different MT bind diffrent #s of MHC.
-
```{,dpi=300, fig.width = 12}
EPITOPE_ESCAPE_AVG=FULL_PREDICTIONS_DT %>% group_by(Peptide, AASeq2) %>% # For each WT-MT peptide combination.
dplyr::summarise(meanMTScore=mean(Prediction), meanWuhanScore=mean(WuhanScore))%>% # Average the MT scores and the Wuhan
mutate(logsOdds = log(meanMTScore/meanWuhanScore)) %>% # Take the logs odds
mutate(OmicronAASEQ2 = ifelse(AASeq2 %in% MUTATIONS$VariantAlignment, TRUE, FALSE))
# Summarise the data per epitope. Generate one mean logs odds across each epitope. Maybe rethink this. mean of a mean score
MUTS_DT=summarySE(EPITOPE_ESCAPE_AVG, measurevar="logsOdds", groupvars=c("Peptide"))
ESCAPE_POT_PER_PEPTIDE_PLT=MUTS_DT%>%
ggplot(aes(x=reorder(Peptide, -logsOdds), y=logsOdds))+geom_bar(stat = "identity")+
geom_errorbar(aes(ymin=logsOdds-se, ymax=logsOdds+se), colour="black", width=.5, # Width of the error bars
position=position_dodge(.9))+theme_pubr(base_size = 12)+rotate_x_text()+xlab("Peptide")+ylab("Logs Odds ratio (MT/WT)")
ESCAPE_POT_PER_PEPTIDE_PLT
saveRDS(ESCAPE_POT_PER_PEPTIDE_PLT,file="ESCAPE_POT_PER_PEPTIDE_PLT.rds")
FULL_PREDICTIONS_DT %>% filter(Peptide %in% 'NGVEGFNCY')
EPITOPE_ESCAPE_AVG=FULL_PREDICTIONS_DT %>% mutate(Score = WuhanScore-Prediction)
# Summarise the data per epitope. Generate one mean logs odds across each epitope. Maybe rethink this. mean of a mean score
MUTS_DT=summarySE(EPITOPE_ESCAPE_AVG, measurevar="Score", groupvars=c("Peptide"))
ESCAPE_POT_PER_PEPTIDE_PLT=MUTS_DT%>%
ggplot(aes(x=reorder(Peptide, -Score), y=Score))+geom_bar(stat = "identity")+
geom_errorbar(aes(ymin=Score-se, ymax=Score+se), colour="black", width=.5, # Width of the error bars
position=position_dodge(.9))+theme_pubr(base_size = 12)+rotate_x_text()+xlab("Peptide")+ylab("Avg change in immunogenicity")
ESCAPE_POT_PER_PEPTIDE_PLT
colPal <- function(x){
pal <- colorRamp(append(ggsci::pal_d3()(2), "white", after=1), space="rgb")
cols <- pal(x)
apply(cols, 1, function(x){rgb(x[1], x[2], x[3], maxColorValue=255)})
}
WTMeans_DT=FULL_PREDICTIONS_DT %>% filter(Binder== "BINDER")%>% select(Peptide, Predicted_Binding,WuhanScore)%>% distinct()%>%group_by(Peptide)%>%
dplyr::summarise(WTMean = mean(WuhanScore))
# Summarise the data per epitope. So for each epitope, average the logs odds.
MUTS_DT=summarySE(FULL_PREDICTIONS_DT, measurevar="logsOdds", groupvars=c("Peptide"))
MUTS_DT=MUTS_DT%>% inner_join(WTMeans_DT)
## Joining, by = "Peptide"
MUTS_DT$ImmunogenicityColor=colPal(MUTS_DT$WTMean)
MUTS_DT$ImmunogenicityColor = scales::alpha(MUTS_DT$ImmunogenicityColor, alpha=0.75)
MUTS_DT=MUTS_DT %>% mutate(logsOdds=logsOdds * -1)
MUTS_DT%>% select(Peptide, logsOdds, se)%>% dplyr::rename(EscapeScore=logsOdds) %>%
readr::write_csv(file="/Users/paulbuckley/Nexus365/WIMM CCB - Koohy Group - Documents/Koohy Group/Effect_Mutation_Tcells/V11/WORKING_VERSION/Supplementary Tables/Table5_EscapeScores.csv")
ESCAPE_POT_PER_PEPTIDE_PLT=MUTS_DT %>%
ggplot(aes(x=reorder(Peptide, -logsOdds), y=logsOdds))+geom_bar(stat = "identity")+
geom_errorbar(aes(ymin=logsOdds-se, ymax=logsOdds+se), colour="black", width=.5, # Width of the error bars
position=position_dodge(.9))+theme_pubr(base_size = 16)+rotate_x_text()+xlab("Peptide")+
ylab("Escape Score")#+ scale_fill_identity()+
#labs(fill="WT Immunogenicity")#+ theme(panel.background = element_rect(fill = '#E3E3E3'))
ESCAPE_POT_PER_PEPTIDE_PLT
saveRDS(ESCAPE_POT_PER_PEPTIDE_PLT,file="ESCAPE_POT_PER_PEPTIDE_PLT.rds")
EPITOPE_ESCAPE_AVG=FULL_PREDICTIONS_DT %>% mutate(PMHClogsOdds = log(Prediction/WuhanScore)) %>% group_by(Peptide, AASeq2) %>% dplyr::summarise(logsOdds=mean(PMHClogsOdds)) # Average the MT scores and the Wuhan
MUTS_DT=summarySE(EPITOPE_ESCAPE_AVG, measurevar=“logsOdds”, groupvars=c(“Peptide”))
ESCAPE_POT_PER_PEPTIDE_PLT=MUTS_DT%>% ggplot(aes(x=reorder(Peptide, -logsOdds), y=logsOdds))+geom_bar(stat = “identity”)+ geom_errorbar(aes(ymin=logsOdds-se, ymax=logsOdds+se), colour=“black”, width=.5, # Width of the error bars position=position_dodge(.9))+theme_pubr(base_size = 12)+rotate_x_text()+xlab(“Peptide”)+ylab(“Logs Odds ratio (MT/WT)”) ESCAPE_POT_PER_PEPTIDE_PLT
# Does fraction hydrophobicity correlate with transition potential?
- in TCR contact vs. anchor
## EDA
- Minor correlation between hydrophobicity in anchor positions and the transition potential
```r
#CONTACT_POS=data.table(POSITIONS=c(3,4,5,6,7,8,9), Length=10)%>% rbind(
#data.table(POSITIONS=c(3,4,5,6,7,8), Length=9))%>%
HYDROPHOBIC_RESIDUES=chemistry %>% filter(group == "Hydrophobic")%>% pull(letter)
NINEMERS=MUTS_DT %>% filter(nchar(Peptide)==9) %>% mutate(TCRContact_Peptide = substr(Peptide, start=3, stop=8))
TENMERS = MUTS_DT %>% filter(nchar(Peptide)==10)%>% mutate(TCRContact_Peptide = substr(Peptide, start=3, stop=9))
MUTS_DT=NINEMERS %>% rbind(TENMERS)
MUTS_DT=MUTS_DT %>% mutate(TCR_HydrophobicCount = stringi::stri_count_regex(MUTS_DT$TCRContact_Peptide, paste0(HYDROPHOBIC_RESIDUES,collapse = "|"))) %>%
mutate(TCR_HydroFraction = TCR_HydrophobicCount/nchar(TCRContact_Peptide))%>% select(!TCR_HydrophobicCount)
MUTS_DT %>% ggscatter(x="logsOdds",y="TCR_HydroFraction", add = "reg.line", # Add regressin line
add.params = list(color = "blue", fill = "lightgray"), # Customize reg. line
conf.int = TRUE, # Add confidence interval
cor.coef = TRUE) # Add correlation coefficient. see ?stat_cor)
## `geom_smooth()` using formula 'y ~ x'
MUTS_DT=MUTS_DT %>% mutate(AnchorRes = stringr::str_remove(Peptide, TCRContact_Peptide))
MUTS_DT=MUTS_DT %>% mutate(Anchor_HydrophobicCount = stringi::stri_count_regex(MUTS_DT$AnchorRes, paste0(HYDROPHOBIC_RESIDUES,collapse = "|"))) %>%
mutate(Anchor_HydroFraction = Anchor_HydrophobicCount/nchar(AnchorRes))%>% select(!Anchor_HydrophobicCount)
MUTS_DT %>% ggscatter(x="logsOdds",y="Anchor_HydroFraction", add = "reg.line", # Add regressin line
add.params = list(color = "blue", fill = "lightgray"), # Customize reg. line
conf.int = TRUE, # Add confidence interval
cor.coef = TRUE) # Add correlation coefficient. see ?stat_cor)
## `geom_smooth()` using formula 'y ~ x'
Trying to identify a link between features and outcome. I.E hydrophobicity in PX vs. Transition potential.
To do this we can analyse by XGBOOST. Robust to correlations in the data
First, create a DT with: Peptide, P1, P2, P3, P4. Each PX entry contains AA in that position.
dummy contrast encode the data, based on hydrophobicity or not in p1, p2, p3, p4.
Build a model and Use feature importance to determine any ;link between features and logs outs ### Start w/ 9mers: XGBOOST on Hydrophbic amino acid in PX. #### Linear regression using continuous target
Catgeorical variables binary encoded (dummy contrasted).
Predicting logs odds: using a linear regression
Result: Perhap hydrophobicity in P7 is related?
# Predict 9mers: logsOdds by categorical variables
library(Matrix)
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
library(xgboost)
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
#stringr::str_split_fixed(TENMERS$Peptide,n=10,pattern = "")
NINEMERS_AA_PX_DT=data.table(Peptide=NINEMERS$Peptide, logsOdds=NINEMERS$logsOdds) %>% cbind(stringr::str_split_fixed(NINEMERS$Peptide,n=9,pattern = ""))
#NINEMERS_AA_PX_DT %>% mutate(across(starts_with("V"), ~ifelse(.x=="A", "Hydrophobic","!Hydrophobic")))
NINEMERS_AA_PX_DT=NINEMERS_AA_PX_DT%>% as.data.table()%>%
mutate(V1 = ifelse(V1 %in% HYDROPHOBIC_RESIDUES, "Hydro","!Hydro"))%>%
mutate(V2 = ifelse(V2 %in% HYDROPHOBIC_RESIDUES, "Hydro","!Hydro"))%>%
mutate(V3 = ifelse(V3 %in% HYDROPHOBIC_RESIDUES, "Hydro","!Hydro"))%>%
mutate(V4 = ifelse(V4 %in% HYDROPHOBIC_RESIDUES, "Hydro","!Hydro"))%>%
mutate(V5 = ifelse(V5 %in% HYDROPHOBIC_RESIDUES, "Hydro","!Hydro"))%>%
mutate(V6 = ifelse(V6 %in% HYDROPHOBIC_RESIDUES, "Hydro","!Hydro"))%>%
mutate(V7 = ifelse(V7 %in% HYDROPHOBIC_RESIDUES, "Hydro","!Hydro"))%>%
mutate(V8 = ifelse(V8 %in% HYDROPHOBIC_RESIDUES, "Hydro","!Hydro"))%>%
mutate(V9 = ifelse(V9 %in% HYDROPHOBIC_RESIDUES, "Hydro","!Hydro"))
NINEMERS_AA_PX_MATRIX = NINEMERS_AA_PX_DT %>% select(!Peptide)
sparse_matrix = sparse.model.matrix(logsOdds ~ ., data = NINEMERS_AA_PX_MATRIX)[,-1]
bst = xgboost(data=sparse_matrix, label = NINEMERS_AA_PX_DT$logsOdds, eta=0.1, objective="reg:linear",max_depth=4,nthread=2, nrounds=15)
## [15:06:32] WARNING: amalgamation/../src/objective/regression_obj.cu:203: reg:linear is now deprecated in favor of reg:squarederror.
## [1] train-rmse:0.692266
## [2] train-rmse:0.660792
## [3] train-rmse:0.630174
## [4] train-rmse:0.603919
## [5] train-rmse:0.580132
## [6] train-rmse:0.556293
## [7] train-rmse:0.536487
## [8] train-rmse:0.516462
## [9] train-rmse:0.497865
## [10] train-rmse:0.479217
## [11] train-rmse:0.465259
## [12] train-rmse:0.451268
## [13] train-rmse:0.438826
## [14] train-rmse:0.426022
## [15] train-rmse:0.415385
importance = xgb.importance(feature_names = colnames(sparse_matrix), model = bst)
importance
## Feature Gain Cover Frequency
## 1: V6Hydro 0.26323265 0.12313433 0.10731707
## 2: V7Hydro 0.24733192 0.09660033 0.13658537
## 3: V9Hydro 0.15385077 0.25497512 0.07317073
## 4: V1Hydro 0.09814379 0.10945274 0.13170732
## 5: V2Hydro 0.06675875 0.05928690 0.17073171
## 6: V5Hydro 0.05408275 0.10530680 0.10243902
## 7: V3Hydro 0.05207463 0.09991708 0.16097561
## 8: V8Hydro 0.03530849 0.04436153 0.06341463
## 9: V4Hydro 0.02921626 0.10696517 0.05365854
xgb.plot.importance(importance_matrix = importance)
# Predict 9mers: binary outcome binary input
#stringr::str_split_fixed(TENMERS$Peptide,n=10,pattern = "")
NINEMERS_AA_PX_DT=data.table(Peptide=NINEMERS$Peptide, logsOdds=NINEMERS$logsOdds) %>% cbind(stringr::str_split_fixed(NINEMERS$Peptide,n=9,pattern = ""))
#NINEMERS_AA_PX_DT %>% mutate(across(starts_with("V"), ~ifelse(.x=="A", "Hydrophobic","!Hydrophobic")))
NINEMERS_AA_PX_DT=NINEMERS_AA_PX_DT%>% as.data.table()%>%
mutate(V1 = ifelse(V1 %in% HYDROPHOBIC_RESIDUES, "Hydro","!Hydro"))%>%
mutate(V2 = ifelse(V2 %in% HYDROPHOBIC_RESIDUES, "Hydro","!Hydro"))%>%
mutate(V3 = ifelse(V3 %in% HYDROPHOBIC_RESIDUES, "Hydro","!Hydro"))%>%
mutate(V4 = ifelse(V4 %in% HYDROPHOBIC_RESIDUES, "Hydro","!Hydro"))%>%
mutate(V5 = ifelse(V5 %in% HYDROPHOBIC_RESIDUES, "Hydro","!Hydro"))%>%
mutate(V6 = ifelse(V6 %in% HYDROPHOBIC_RESIDUES, "Hydro","!Hydro"))%>%
mutate(V7 = ifelse(V7 %in% HYDROPHOBIC_RESIDUES, "Hydro","!Hydro"))%>%
mutate(V8 = ifelse(V8 %in% HYDROPHOBIC_RESIDUES, "Hydro","!Hydro"))%>%
mutate(V9 = ifelse(V9 %in% HYDROPHOBIC_RESIDUES, "Hydro","!Hydro"))
NINEMERS_AA_PX_MATRIX = NINEMERS_AA_PX_DT %>% select(!Peptide)
sparse_matrix = sparse.model.matrix(logsOdds ~ ., data = NINEMERS_AA_PX_MATRIX)[,-1]
NINEMERS_AA_PX_DT=NINEMERS_AA_PX_DT%>% mutate(BINARY_OUT = ifelse(logsOdds>0,1,0))
bst = xgboost(data=sparse_matrix, label = NINEMERS_AA_PX_DT$BINARY_OUT, eta=1, objective="binary:logistic",max_depth=4,nthread=2, nrounds=15)
## [1] train-logloss:0.436531
## [2] train-logloss:0.375321
## [3] train-logloss:0.294124
## [4] train-logloss:0.261126
## [5] train-logloss:0.243249
## [6] train-logloss:0.234259
## [7] train-logloss:0.231075
## [8] train-logloss:0.228008
## [9] train-logloss:0.221525
## [10] train-logloss:0.207540
## [11] train-logloss:0.203544
## [12] train-logloss:0.201184
## [13] train-logloss:0.199824
## [14] train-logloss:0.195798
## [15] train-logloss:0.194051
importance = xgb.importance(feature_names = colnames(sparse_matrix), model = bst)
importance
## Feature Gain Cover Frequency
## 1: V5Hydro 0.229909624 0.11152606 0.12820513
## 2: V2Hydro 0.218161826 0.09785625 0.15384615
## 3: V6Hydro 0.176017251 0.19396654 0.15384615
## 4: V3Hydro 0.115665095 0.14125067 0.10256410
## 5: V7Hydro 0.103601480 0.10456772 0.05128205
## 6: V4Hydro 0.070428880 0.07177072 0.10256410
## 7: V1Hydro 0.047210622 0.11959293 0.10256410
## 8: V9Hydro 0.032514794 0.09264624 0.12820513
## 9: V8Hydro 0.006490428 0.06682288 0.07692308
xgb.plot.importance(importance_matrix = importance)
NINEMERS_AA_PX_DT=data.table(Peptide=NINEMERS$Peptide, logsOdds=NINEMERS$logsOdds) %>% cbind(stringr::str_split_fixed(NINEMERS$Peptide,n=9,pattern = ""))
NINEMERS_AA_PX_DT=NINEMERS_AA_PX_DT%>% as.data.table()%>%
mutate(V1 = ifelse(V1 %in% HYDROPHOBIC_RESIDUES, 1,0))%>%
mutate(V2 = ifelse(V2 %in% HYDROPHOBIC_RESIDUES, 1,0))%>%
mutate(V3 = ifelse(V3 %in% HYDROPHOBIC_RESIDUES, 1,0))%>%
mutate(V4 = ifelse(V4 %in% HYDROPHOBIC_RESIDUES, 1,0))%>%
mutate(V5 = ifelse(V5 %in% HYDROPHOBIC_RESIDUES, 1,0))%>%
mutate(V6 = ifelse(V6 %in% HYDROPHOBIC_RESIDUES, 1,0))%>%
mutate(V7 = ifelse(V7 %in% HYDROPHOBIC_RESIDUES, 1,0))%>%
mutate(V8 = ifelse(V8 %in% HYDROPHOBIC_RESIDUES, 1,0))%>%
mutate(V9 = ifelse(V9 %in% HYDROPHOBIC_RESIDUES, 1,0))
#linear_model = lm(formula = logsOdds ~ V1+V2+V3+V4+V5+V6+V7+V8+V9, data=NINEMERS_AA_PX_DT)
#summary(linear_model)
library(Boruta)
NINEMERS_AA_PX_DT=NINEMERS_AA_PX_DT %>% inner_join(MUTS_DT %>% select(Peptide, Anchor_HydroFraction, TCR_HydroFraction))%>%
dplyr::rename(AnchorHydro=Anchor_HydroFraction, TCRHydro=TCR_HydroFraction)
## Joining, by = "Peptide"
NINEMERS_AA_PX_DT_FULL=NINEMERS_AA_PX_DT
# Boruta using logsOdds out
boruta_output <- Boruta(logsOdds ~ ., data=NINEMERS_AA_PX_DT%>% select(! Peptide), doTrace=0)
roughFixMod <- TentativeRoughFix(boruta_output)
imps <- attStats(roughFixMod)
imps2 = imps[imps$decision != 'Rejected', c('meanImp', 'decision')]
head(imps2[order(-imps2$meanImp), ]) # descending sort
## [1] meanImp decision
## <0 rows> (or 0-length row.names)
plot(boruta_output, cex.axis=.7, las=2, xlab="", main="Variable Importance")
# Using binary output
NINEMERS_AA_PX_DT=NINEMERS_AA_PX_DT %>% mutate(BINARY_OUT = ifelse(logsOdds>0,1,0))
boruta_output <- Boruta(BINARY_OUT ~ ., data=NINEMERS_AA_PX_DT%>% select(! c(Peptide, logsOdds)), doTrace=0)
roughFixMod <- TentativeRoughFix(boruta_output)
## Warning in TentativeRoughFix(boruta_output): There are no Tentative attributes!
## Returning original object.
imps <- attStats(roughFixMod)
imps2 = imps[imps$decision != 'Rejected', c('meanImp', 'decision')]
head(imps2[order(-imps2$meanImp), ]) # descending sort
## meanImp decision
## V7 7.79069 Confirmed
plot(boruta_output, cex.axis=.7, las=2, xlab="", main="Variable Importance")
library(mltools)
##
## Attaching package: 'mltools'
## The following objects are masked from 'package:yardstick':
##
## mcc, rmse
## The following object is masked from 'package:tidyr':
##
## replace_na
NINEMERS_AA_PX_DT=data.table(Peptide=NINEMERS$Peptide, logsOdds=NINEMERS$logsOdds) %>% cbind(stringr::str_split_fixed(NINEMERS$Peptide,n=9,pattern = ""))
NINEMERS_AA_ONEHOT=caret::dummyVars(" ~ . ", NINEMERS_AA_PX_DT %>% select(! c(Peptide, logsOdds)))
newdata <- data.frame(predict(NINEMERS_AA_ONEHOT, newdata = NINEMERS_AA_PX_DT %>% select(! c(Peptide, logsOdds))))
NINEMERS_AA_ONEHOT=NINEMERS_AA_PX_DT %>% select(Peptide, logsOdds)%>% cbind(newdata)
NINEMERS_AA_ONEHOT=NINEMERS_AA_ONEHOT %>% mutate(BINARY_OUT = ifelse(logsOdds>0,1,0))
colnames(NINEMERS_AA_ONEHOT) = gsub("V","",colnames(NINEMERS_AA_ONEHOT))
NINEMERS_AA_PX_DT_FULL=NINEMERS_AA_PX_DT_FULL %>% inner_join(NINEMERS_AA_ONEHOT)
## Joining, by = c("Peptide", "logsOdds")
boruta_output <- Boruta(logsOdds ~ ., data=NINEMERS_AA_ONEHOT%>% select(! c(Peptide, BINARY_OUT)), doTrace=0)
roughFixMod <- TentativeRoughFix(boruta_output)
## Warning in TentativeRoughFix(boruta_output): There are no Tentative attributes!
## Returning original object.
imps <- attStats(roughFixMod)
imps2 = imps[imps$decision != 'Rejected', c('meanImp', 'decision')]
head(imps2[order(-imps2$meanImp), ]) # descending sort
## meanImp decision
## `9Y` 16.228629 Confirmed
## `4Q` 8.600995 Confirmed
## `3R` 6.859537 Confirmed
## `3A` 6.371848 Confirmed
boruta_output
## Boruta performed 79 iterations in 2.26899 secs.
## 4 attributes confirmed important: `3A`, `3R`, `4Q`, `9Y`;
## 133 attributes confirmed unimportant: `1A`, `1C`, `1F`, `1G`, `1H` and
## 128 more;
plot(boruta_output, cex.axis=.7, las=2, xlab="", main="Variable Importance")
boruta_output <- Boruta(BINARY_OUT ~ ., data=NINEMERS_AA_ONEHOT%>% select(! c(Peptide, logsOdds)), doTrace=0)
roughFixMod <- TentativeRoughFix(boruta_output)
## Warning in TentativeRoughFix(boruta_output): There are no Tentative attributes!
## Returning original object.
imps <- attStats(roughFixMod)
imps2 = imps[imps$decision != 'Rejected', c('meanImp', 'decision')]
head(imps2[order(-imps2$meanImp), ]) # descending sort
## [1] meanImp decision
## <0 rows> (or 0-length row.names)
boruta_output
## Boruta performed 14 iterations in 0.9822969 secs.
## No attributes deemed important.
## 137 attributes confirmed unimportant: `1A`, `1C`, `1F`, `1G`, `1H` and
## 132 more;
plot(boruta_output, cex.axis=.7, las=2, xlab="", main="Variable Importance")
# Simply AA composition
AACOMP_DT_NINEMERS=foreach::foreach(i=1:nrow(NINEMERS_AA_PX_DT),.combine = "rbind", .packages = c("dplyr","data.table","protr"))%do% {
PEPTIDE = NINEMERS_AA_PX_DT %>% filter(row_number()==i) %>% pull(Peptide)
AA_COMP = protr::extractAAC(PEPTIDE)
DT=NINEMERS_AA_PX_DT %>% filter(row_number()==i) %>% select(Peptide, logsOdds)
AA_COMP_DT= t(AA_COMP) %>% tibble::as.tibble()
DT %>% cbind(AA_COMP_DT)
}
## Warning: `as.tibble()` was deprecated in tibble 2.0.0.
## Please use `as_tibble()` instead.
## The signature and semantics have changed, see `?as_tibble`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
boruta_output <- Boruta(logsOdds ~ ., data=AACOMP_DT_NINEMERS%>% select(! c(Peptide)), doTrace=0)
roughFixMod <- TentativeRoughFix(boruta_output)
## Warning in TentativeRoughFix(boruta_output): There are no Tentative attributes!
## Returning original object.
imps <- attStats(roughFixMod)
imps2 = imps[imps$decision != 'Rejected', c('meanImp', 'decision')]
head(imps2[order(-imps2$meanImp), ]) # descending sort
## meanImp decision
## N 6.089681 Confirmed
boruta_output
## Boruta performed 45 iterations in 0.841604 secs.
## 1 attributes confirmed important: N;
## 19 attributes confirmed unimportant: A, C, D, E, F and 14 more;
plot(boruta_output, cex.axis=.7, las=2, xlab="", main="Variable Importance")
linear_model = lm(formula = logsOdds ~ G+S+T+Y+C+N+Q+K+R+H+D+E+P+A+W+F+L+I+M+V, data=AACOMP_DT_NINEMERS)
summary(linear_model)
##
## Call:
## lm(formula = logsOdds ~ G + S + T + Y + C + N + Q + K + R + H +
## D + E + P + A + W + F + L + I + M + V, data = AACOMP_DT_NINEMERS)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.8148 -0.3108 0.0427 0.2759 1.0856
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.57414 1.47689 1.743 0.0960 .
## G -5.51339 1.99350 -2.766 0.0116 *
## S -2.62162 1.77830 -1.474 0.1553
## T 2.88371 2.77491 1.039 0.3105
## Y -2.63400 2.72076 -0.968 0.3440
## C -3.98467 3.30443 -1.206 0.2413
## N -4.36494 2.04596 -2.133 0.0448 *
## Q -1.06714 2.38385 -0.448 0.6590
## K 2.20786 1.92854 1.145 0.2652
## R -3.13333 2.32120 -1.350 0.1914
## H -3.02713 4.13155 -0.733 0.4718
## D -5.90712 3.09920 -1.906 0.0704 .
## E -8.08605 3.06219 -2.641 0.0153 *
## P -4.04966 2.09788 -1.930 0.0672 .
## A -1.68112 2.63206 -0.639 0.5299
## W -9.24883 3.58652 -2.579 0.0175 *
## F -2.47352 2.11475 -1.170 0.2552
## L -0.03142 2.47173 -0.013 0.9900
## I -1.31770 2.64432 -0.498 0.6234
## M 1.84980 5.28006 0.350 0.7296
## V NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5898 on 21 degrees of freedom
## Multiple R-squared: 0.6401, Adjusted R-squared: 0.3145
## F-statistic: 1.966 on 19 and 21 DF, p-value: 0.06776
# cmobine everything
NINEMERS_AA_PX_DT_FULL=NINEMERS_AA_PX_DT_FULL %>% inner_join(AACOMP_DT_NINEMERS)
## Joining, by = c("Peptide", "logsOdds")
boruta_output <- Boruta(logsOdds ~ ., data=NINEMERS_AA_PX_DT_FULL%>% select(! c(Peptide, BINARY_OUT)), doTrace=0)
roughFixMod <- TentativeRoughFix(boruta_output)
imps <- attStats(roughFixMod)
imps2 = imps[imps$decision != 'Rejected', c('meanImp', 'decision')]
head(imps2[order(-imps2$meanImp), ]) # descending sort
## meanImp decision
## `9Y` 11.954615 Confirmed
## `4Q` 9.298192 Confirmed
## `3A` 6.358678 Confirmed
boruta_output
## Boruta performed 99 iterations in 3.308444 secs.
## 3 attributes confirmed important: `3A`, `4Q`, `9Y`;
## 163 attributes confirmed unimportant: A, AnchorHydro, C, D, E and 158
## more;
## 2 tentative attributes left: N, `8L`;
plot(boruta_output, cex.axis=.7, las=2, xlab="", main="Variable Importance")
TENMERS_AA_PX_DT=data.table(Peptide=TENMERS$Peptide, logsOdds=TENMERS$logsOdds) %>% cbind(stringr::str_split_fixed(TENMERS$Peptide,n=10,pattern = ""))
#NINEMERS_AA_PX_DT %>% mutate(across(starts_with("V"), ~ifelse(.x=="A", "Hydrophobic","!Hydrophobic")))
TENMERS_AA_PX_DT=TENMERS_AA_PX_DT%>% as.data.table()%>%
mutate(V1 = ifelse(V1 %in% HYDROPHOBIC_RESIDUES, "Hydro","!Hydro"))%>%
mutate(V2 = ifelse(V2 %in% HYDROPHOBIC_RESIDUES, "Hydro","!Hydro"))%>%
mutate(V3 = ifelse(V3 %in% HYDROPHOBIC_RESIDUES, "Hydro","!Hydro"))%>%
mutate(V4 = ifelse(V4 %in% HYDROPHOBIC_RESIDUES, "Hydro","!Hydro"))%>%
mutate(V5 = ifelse(V5 %in% HYDROPHOBIC_RESIDUES, "Hydro","!Hydro"))%>%
mutate(V6 = ifelse(V6 %in% HYDROPHOBIC_RESIDUES, "Hydro","!Hydro"))%>%
mutate(V7 = ifelse(V7 %in% HYDROPHOBIC_RESIDUES, "Hydro","!Hydro"))%>%
mutate(V8 = ifelse(V8 %in% HYDROPHOBIC_RESIDUES, "Hydro","!Hydro"))%>%
mutate(V9 = ifelse(V9 %in% HYDROPHOBIC_RESIDUES, "Hydro","!Hydro"))%>%
mutate(V10 = ifelse(V10 %in% HYDROPHOBIC_RESIDUES, "Hydro","!Hydro"))
TENMERS_AA_PX_MATRIX = TENMERS_AA_PX_DT %>% select(!Peptide)
sparse_matrix = sparse.model.matrix(logsOdds ~ ., data = TENMERS_AA_PX_MATRIX)[,-1]
TENMERS_AA_PX_DT=TENMERS_AA_PX_DT%>% mutate(BINARY_OUT = ifelse(logsOdds>0,1,0))
bst = xgboost(data=sparse_matrix, label = TENMERS_AA_PX_DT$BINARY_OUT, eta=1, objective="binary:logistic",max_depth=4,nthread=2, nrounds=15)
## [1] train-logloss:0.553929
## [2] train-logloss:0.493942
## [3] train-logloss:0.464173
## [4] train-logloss:0.439194
## [5] train-logloss:0.425441
## [6] train-logloss:0.414300
## [7] train-logloss:0.404312
## [8] train-logloss:0.393396
## [9] train-logloss:0.385126
## [10] train-logloss:0.370233
## [11] train-logloss:0.363904
## [12] train-logloss:0.347948
## [13] train-logloss:0.344436
## [14] train-logloss:0.338114
## [15] train-logloss:0.335126
importance = xgb.importance(feature_names = colnames(sparse_matrix), model = bst)
importance
## Feature Gain Cover Frequency
## 1: V5Hydro 0.216322121 0.13842264 0.16129032
## 2: V1Hydro 0.208311883 0.17971085 0.16129032
## 3: V3Hydro 0.184389950 0.22137734 0.19354839
## 4: V6Hydro 0.141222662 0.12043871 0.09677419
## 5: V7Hydro 0.126480483 0.12675823 0.16129032
## 6: V8Hydro 0.052622959 0.04381815 0.03225806
## 7: V2Hydro 0.049335904 0.08890103 0.09677419
## 8: V10Hydro 0.017674372 0.04644848 0.06451613
## 9: V4Hydro 0.003639666 0.03412455 0.03225806
xgb.plot.importance(importance_matrix = importance)
TENMERS_AA_PX_DT=data.table(Peptide=TENMERS$Peptide, logsOdds=TENMERS$logsOdds) %>% cbind(stringr::str_split_fixed(TENMERS$Peptide,n=10,pattern = ""))
TENMERS_AA_PX_DT=TENMERS_AA_PX_DT%>% as.data.table()%>%
mutate(V1 = ifelse(V1 %in% HYDROPHOBIC_RESIDUES, 1,0))%>%
mutate(V2 = ifelse(V2 %in% HYDROPHOBIC_RESIDUES, 1,0))%>%
mutate(V3 = ifelse(V3 %in% HYDROPHOBIC_RESIDUES, 1,0))%>%
mutate(V4 = ifelse(V4 %in% HYDROPHOBIC_RESIDUES, 1,0))%>%
mutate(V5 = ifelse(V5 %in% HYDROPHOBIC_RESIDUES, 1,0))%>%
mutate(V6 = ifelse(V6 %in% HYDROPHOBIC_RESIDUES, 1,0))%>%
mutate(V7 = ifelse(V7 %in% HYDROPHOBIC_RESIDUES, 1,0))%>%
mutate(V8 = ifelse(V8 %in% HYDROPHOBIC_RESIDUES, 1,0))%>%
mutate(V9 = ifelse(V9 %in% HYDROPHOBIC_RESIDUES, 1,0))%>%
mutate(V10 = ifelse(V10 %in% HYDROPHOBIC_RESIDUES, 1,0))
#linear_model = lm(formula = logsOdds ~ V1+V2+V3+V4+V5+V6+V7+V8+V9, data=NINEMERS_AA_PX_DT)
#summary(linear_model)
TENMERS_AA_PX_DT=TENMERS_AA_PX_DT %>% inner_join(MUTS_DT %>% select(Peptide, Anchor_HydroFraction, TCR_HydroFraction))%>%
dplyr::rename(AnchorHydro=Anchor_HydroFraction, TCRHydro=TCR_HydroFraction)
## Joining, by = "Peptide"
TENMERS_AA_PX_DT_FULL=TENMERS_AA_PX_DT
# Boruta using logsOdds out
boruta_output <- Boruta(logsOdds ~ ., data=TENMERS_AA_PX_DT%>% select(! Peptide), doTrace=0)
roughFixMod <- TentativeRoughFix(boruta_output)
## Warning in TentativeRoughFix(boruta_output): There are no Tentative attributes!
## Returning original object.
imps <- attStats(roughFixMod)
imps2 = imps[imps$decision != 'Rejected', c('meanImp', 'decision')]
head(imps2[order(-imps2$meanImp), ]) # descending sort
## [1] meanImp decision
## <0 rows> (or 0-length row.names)
plot(boruta_output, cex.axis=.7, las=2, xlab="", main="Variable Importance")
# Using binary output
TENMERS_AA_PX_DT=TENMERS_AA_PX_DT %>% mutate(BINARY_OUT = ifelse(logsOdds>0,1,0))
boruta_output <- Boruta(BINARY_OUT ~ ., data=TENMERS_AA_PX_DT%>% select(! c(Peptide, logsOdds)), doTrace=0)
roughFixMod <- TentativeRoughFix(boruta_output)
## Warning in TentativeRoughFix(boruta_output): There are no Tentative attributes!
## Returning original object.
imps <- attStats(roughFixMod)
imps2 = imps[imps$decision != 'Rejected', c('meanImp', 'decision')]
head(imps2[order(-imps2$meanImp), ]) # descending sort
## [1] meanImp decision
## <0 rows> (or 0-length row.names)
plot(boruta_output, cex.axis=.7, las=2, xlab="", main="Variable Importance")
TENMERS_AA_PX_DT=data.table(Peptide=TENMERS$Peptide, logsOdds=TENMERS$logsOdds) %>% cbind(stringr::str_split_fixed(TENMERS$Peptide,n=10,pattern = ""))
TENMERS_AA_ONEHOT=caret::dummyVars(" ~ . ", TENMERS_AA_PX_DT %>% select(! c(Peptide, logsOdds)))
newdata <- data.frame(predict(TENMERS_AA_ONEHOT, newdata = TENMERS_AA_PX_DT %>% select(! c(Peptide, logsOdds))))
TENMERS_AA_ONEHOT=TENMERS_AA_PX_DT %>% select(Peptide, logsOdds)%>% cbind(newdata)
TENMERS_AA_ONEHOT=TENMERS_AA_ONEHOT %>% mutate(BINARY_OUT = ifelse(logsOdds>0,1,0))
colnames(TENMERS_AA_ONEHOT) = gsub("V","",colnames(TENMERS_AA_ONEHOT))
TENMERS_AA_PX_DT_FULL=TENMERS_AA_PX_DT_FULL %>% inner_join(TENMERS_AA_ONEHOT)
## Joining, by = c("Peptide", "logsOdds")
boruta_output <- Boruta(logsOdds ~ ., data=TENMERS_AA_ONEHOT%>% select(! c(Peptide, BINARY_OUT)), doTrace=0)
roughFixMod <- TentativeRoughFix(boruta_output)
imps <- attStats(roughFixMod)
imps2 = imps[imps$decision != 'Rejected', c('meanImp', 'decision')]
head(imps2[order(-imps2$meanImp), ]) # descending sort
## meanImp decision
## `3` 6.713622 Confirmed
boruta_output
## Boruta performed 99 iterations in 3.086757 secs.
## 1 attributes confirmed important: `3`;
## 125 attributes confirmed unimportant: `10F`, `10I`, `10K`, `10L`,
## `10R` and 120 more;
## 7 tentative attributes left: `5K`, `5S`, `7G`, `7L`, `7S` and 2 more;
plot(boruta_output, cex.axis=.7, las=2, xlab="", main="Variable Importance")
boruta_output <- Boruta(BINARY_OUT ~ ., data=TENMERS_AA_ONEHOT%>% select(! c(Peptide, logsOdds)), doTrace=0)
roughFixMod <- TentativeRoughFix(boruta_output)
imps <- attStats(roughFixMod)
imps2 = imps[imps$decision != 'Rejected', c('meanImp', 'decision')]
head(imps2[order(-imps2$meanImp), ]) # descending sort
## meanImp decision
## `7G` 12.337362 Confirmed
## `1F` 9.320999 Confirmed
boruta_output
## Boruta performed 99 iterations in 2.760607 secs.
## 2 attributes confirmed important: `1F`, `7G`;
## 130 attributes confirmed unimportant: `10F`, `10I`, `10K`, `10L`,
## `10R` and 125 more;
## 1 tentative attributes left: `1Q`;
plot(boruta_output, cex.axis=.7, las=2, xlab="", main="Variable Importance")
# Simply AA composition
AACOMP_DT_TENMERS=foreach::foreach(i=1:nrow(TENMERS_AA_PX_DT),.combine = "rbind", .packages = c("dplyr","data.table","protr"))%do% {
PEPTIDE = TENMERS_AA_PX_DT %>% filter(row_number()==i) %>% pull(Peptide)
AA_COMP = protr::extractAAC(PEPTIDE)
DT=TENMERS_AA_PX_DT %>% filter(row_number()==i) %>% select(Peptide, logsOdds)
AA_COMP_DT= t(AA_COMP) %>% tibble::as.tibble()
DT %>% cbind(AA_COMP_DT)
}
boruta_output <- Boruta(logsOdds ~ ., data=AACOMP_DT_TENMERS%>% select(! c(Peptide)), doTrace=0)
roughFixMod <- TentativeRoughFix(boruta_output)
imps <- attStats(roughFixMod)
imps2 = imps[imps$decision != 'Rejected', c('meanImp', 'decision')]
head(imps2[order(-imps2$meanImp), ]) # descending sort
## meanImp decision
## N 3.304996 Confirmed
## S 2.682743 Confirmed
boruta_output
## Boruta performed 99 iterations in 2.019348 secs.
## No attributes deemed important.
## 16 attributes confirmed unimportant: C, D, E, F, H and 11 more;
## 4 tentative attributes left: A, G, N, S;
plot(boruta_output, cex.axis=.7, las=2, xlab="", main="Variable Importance")
linear_model = lm(formula = logsOdds ~ G+S+T+Y+C+N+Q+K+R+H+D+E+P+A+W+F+L+I+M+V, data=AACOMP_DT_TENMERS)
summary(linear_model)
##
## Call:
## lm(formula = logsOdds ~ G + S + T + Y + C + N + Q + K + R + H +
## D + E + P + A + W + F + L + I + M + V, data = AACOMP_DT_TENMERS)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.58330 -0.14425 -0.03811 0.22040 0.40244
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.5761 2.3214 -0.248 0.811
## G 1.0597 4.6133 0.230 0.825
## S 2.5072 2.7670 0.906 0.395
## T -2.3503 3.0694 -0.766 0.469
## Y 3.3146 3.7571 0.882 0.407
## C -2.9864 3.9186 -0.762 0.471
## N -5.0087 2.7558 -1.818 0.112
## Q -0.8728 2.4530 -0.356 0.732
## K 1.6939 2.5504 0.664 0.528
## R 4.6325 3.6065 1.284 0.240
## H 6.9650 6.6889 1.041 0.332
## D 1.9650 3.9476 0.498 0.634
## E -1.6340 3.4608 -0.472 0.651
## P 2.5032 4.8145 0.520 0.619
## A 3.8156 3.6678 1.040 0.333
## W 5.4173 5.2457 1.033 0.336
## F 1.8980 2.6660 0.712 0.500
## L -0.4852 3.3743 -0.144 0.890
## I -0.2274 2.8460 -0.080 0.939
## M NA NA NA NA
## V NA NA NA NA
##
## Residual standard error: 0.4593 on 7 degrees of freedom
## Multiple R-squared: 0.8026, Adjusted R-squared: 0.2949
## F-statistic: 1.581 on 18 and 7 DF, p-value: 0.2766
# cmobine everything
TENMERS_AA_PX_DT_FULL=TENMERS_AA_PX_DT_FULL %>% inner_join(AACOMP_DT_TENMERS)
## Joining, by = c("Peptide", "logsOdds")
boruta_output <- Boruta(logsOdds ~ ., data=TENMERS_AA_PX_DT_FULL%>% select(! c(Peptide, BINARY_OUT)), doTrace=0)
roughFixMod <- TentativeRoughFix(boruta_output)
imps <- attStats(roughFixMod)
imps2 = imps[imps$decision != 'Rejected', c('meanImp', 'decision')]
head(imps2[order(-imps2$meanImp), ]) # descending sort
## meanImp decision
## `7L` 7.212546 Confirmed
boruta_output
## Boruta performed 99 iterations in 3.222649 secs.
## 1 attributes confirmed important: `7L`;
## 159 attributes confirmed unimportant: AnchorHydro, C, D, E, F and 154
## more;
## 5 tentative attributes left: A, `3L`, `3`, `5K`, `7G`;
plot(boruta_output, cex.axis=.7, las=2, xlab="", main="Variable Importance")
chemistry %>% DT::datatable()
#EPITOPE_ESCAPE_AVG=FULL_PREDICTIONS_DT %>% mutate(Score = WuhanScore-Prediction)
# Summarise the data per epitope. Generate one mean logs odds across each epitope. Maybe rethink this. mean of a mean score
#MUTS_DT=summarySE(EPITOPE_ESCAPE_AVG, measurevar="Score", groupvars=c("Peptide"))
HIGH_SET = MUTS_DT%>% slice_max(order_by = logsOdds, n=9) %>% mutate(Dataset = "High")
LOW_SET = MUTS_DT%>% slice_min(order_by = logsOdds, n=9) %>% mutate(Dataset = "Negative")
LOW_HIGH_DT= LOW_SET %>% rbind(HIGH_SET)
my_comparisons = list(c("High","Negative"))
REMAINING_CHEMISTRY=chemistry #%>% filter(! group == "Hydrophobic")
#REMAINING_CHEMISTRY_MUTS_DT=foreach::foreach(i=1:length(unique(REMAINING_CHEMISTRY$group)), .combine = "rbind", .packages = c("dplyr","data.table"))%do% {
# CHEMISTRY_GRP = unique(REMAINING_CHEMISTRY$group)[i]
# CHEMISTRY_AAS = REMAINING_CHEMISTRY %>% filter(group == CHEMISTRY_GRP)%>% pull(letter)
# MUTS_DT %>% select(Peptide,logsOdds, TCRContact_Peptide, AnchorRes)%>%
# mutate(Anchor_Count = stringi::stri_count_regex(MUTS_DT$AnchorRes, paste0(CHEMISTRY_AAS,collapse = "|"))) %>%
# mutate(Anchor_Fraction = Anchor_Count/nchar(AnchorRes))%>% select(!Anchor_Count)%>% mutate(group=CHEMISTRY_GRP)%>%
# mutate(TCRContact_Count = stringi::stri_count_regex(MUTS_DT$TCRContact_Peptide, paste0(CHEMISTRY_AAS,collapse = "|"))) %>%
# mutate(TCRContact_Fraction = TCRContact_Count/nchar(TCRContact_Peptide))%>% select(!TCRContact_Count)%>% mutate(group=CHEMISTRY_GRP)
#}
REMAINING_CHEMISTRY_MUTS_DT=foreach::foreach(i=1:length(unique(REMAINING_CHEMISTRY$group)), .combine = "rbind", .packages = c("dplyr","data.table"))%do% {
CHEMISTRY_GRP = unique(REMAINING_CHEMISTRY$group)[i]
CHEMISTRY_AAS = REMAINING_CHEMISTRY %>% filter(group == CHEMISTRY_GRP)%>% pull(letter)
MUTS_DT %>% select(Peptide,logsOdds, TCRContact_Peptide, AnchorRes)%>%
mutate(Anchor_Count = stringi::stri_count_regex(MUTS_DT$AnchorRes, paste0(CHEMISTRY_AAS,collapse = "|"))) %>%
mutate(Anchor_Fraction = Anchor_Count/nchar(AnchorRes))%>% select(!Anchor_Count)%>% mutate(group=CHEMISTRY_GRP)%>%
mutate(TCRContact_Count = stringi::stri_count_regex(MUTS_DT$TCRContact_Peptide, paste0(CHEMISTRY_AAS,collapse = "|"))) %>%
mutate(TCRContact_Fraction = TCRContact_Count/nchar(TCRContact_Peptide))%>% select(!TCRContact_Count)%>% mutate(group=CHEMISTRY_GRP)
}
LOW_SET = REMAINING_CHEMISTRY_MUTS_DT%>% filter(Peptide %in% LOW_SET$Peptide)%>% mutate(Dataset = "Negative")#
HIGH_SET = REMAINING_CHEMISTRY_MUTS_DT%>% filter(Peptide %in% HIGH_SET$Peptide)%>% mutate(Dataset = "High")
TOLERANT_SET = REMAINING_CHEMISTRY_MUTS_DT%>% filter(Peptide %in% c("TLACFVLAAV","YYHKNNKSW","FGEVFNATRF","QTGKIADYNY","QWNLVIGFLF","GEVFNATRF","FQPTNGVGY",
"SYQTQTNSPR","DSKVGGNYNY"))%>% mutate(Dataset = "Negligible")
#TOLERANT_SET = REMAINING_CHEMISTRY_MUTS_DT%>% filter(Peptide %in% c("GEVFNATRF","SYQTQTNSPR","FGEVFNATRF","CNDPFLGVYY","HVSGTNGTK"))%>% mutate(Dataset = "Negligible")
LOW_HIGH_DT= LOW_SET %>% rbind(HIGH_SET)%>% rbind(TOLERANT_SET)%>% mutate(Dataset = factor(Dataset, levels = c("High","Negligible","Negative")))
#ENHANCED_IMP_DT %>%
# ggplot(aes(x=Dataset, y=Anchor_Fraction, fill=Dataset))+
# geom_dotplot(binaxis="y", stackdir="center",stackratio=1,dotsize = 0.7,binpositions = "all")+
# theme_pubr(base_size = 16) +
# facet_grid(~group)+
# stat_summary(fun=mean, geom="crossbar", width=0.7) +
# stat_compare_means(comparisons = my_comparisons, label="p.signif")
HYDROPHOBICITY_ANCHORFREQ_PLT=LOW_HIGH_DT %>% filter(group == "Hydrophobic") %>%
ggviolin(x="Dataset",y="Anchor_Fraction",color="Dataset", add=c("boxplot","jitter"),trim=TRUE,palette = "jco")+
stat_compare_means(comparisons = my_comparisons, label="p.signif",tip.length = 0.05,label.y=1.09)+
theme_pubr(base_size = 16)+ylab("Freq. of hydrophobic residues") +
facet_grid(~group)
HYDROPHOBICITY_ANCHORFREQ_PLT
#HYDROPHOBICITY_ANCHORFREQ_PLT=LOW_HIGH_DT %>% filter(group == "Hydrophobic") %>%
# ggboxplot(x="Dataset",y="Anchor_Fraction",color="Dataset", add=c("jitter"),palette = "jco")+
# stat_compare_means(comparisons = my_comparisons, label="p.signif",tip.length = 0.05,label.y=1.05)+
# theme_pubr(base_size = 16)+ylab("Freq. of hydrophobic residues") +
# facet_grid(~group)
#HYDROPHOBICITY_ANCHORFREQ_PLT
#HYDROPHOBICITY_ANCHORFREQ_PLT=LOW_HIGH_DT %>% filter(group == "Hydrophobic") %>%
# ggplot(aes(x=Dataset, y=Anchor_Fraction, color=Dataset))+geom_violin()+geom_jitter()+
# stat_compare_means(comparisons = my_comparisons, label="p.signif",tip.length = 0.05,label.y=1.09)+
# theme_pubr(base_size = 16)+ylab("Freq. of hydrophobic residues") +
# facet_grid(~group)
#HYDROPHOBICITY_ANCHORFREQ_PLT
saveRDS(HYDROPHOBICITY_ANCHORFREQ_PLT,file="HYDROPHOBICITY_ANCHORFREQ_PLT.rds")
#ENHANCED_IMP_DT %>%
# ggviolin(x="Dataset",y="TCRContact_Fraction",color="Dataset", add=c("boxplot","jitter"),palette = "jco")+
# stat_compare_means(comparisons = my_comparisons, label="p.signif",label.y = 1.6,label.sep = 0.2)+
# theme_pubr(base_size = 16) +
# facet_grid(~group)
HLA_TRANSITIONPOT_GRPS_PLT=FULL_PREDICTIONS_DT %>% inner_join(LOW_HIGH_DT %>% select(Peptide, Dataset) %>% distinct())%>%
group_by(Peptide, Dataset)%>%filter(Binder == "BINDER")%>%
dplyr::summarise(n=n_distinct(Predicted_Binding))%>%
ggboxplot(x="Dataset",y="n",add="jitter",color="Dataset",palette = "jco")+theme_pubr(base_size = 16)+stat_compare_means(comparisons = my_comparisons,label="p.signif")+ylab("# HLAs bound")
## Joining, by = "Peptide"
## `summarise()` has grouped output by 'Peptide'. You can override using the `.groups` argument.
HLA_TRANSITIONPOT_GRPS_PLT
saveRDS(HLA_TRANSITIONPOT_GRPS_PLT,file="HLA_TRANSITIONPOT_GRPS_PLT.rds")
NINE_LOW_HIGH_DT = LOW_HIGH_DT%>% select(Peptide, Dataset)%>% distinct() %>% filter(nchar(Peptide)==9)
NINE_LOW_HIGH_DT=NINE_LOW_HIGH_DT%>% cbind(stringr::str_split_fixed(NINE_LOW_HIGH_DT$Peptide,n=9,pattern = ""))
TEN_LOW_HIGH_DT = LOW_HIGH_DT%>% select(Peptide, Dataset)%>% distinct() %>% filter(nchar(Peptide)==10)
TEN_LOW_HIGH_DT=TEN_LOW_HIGH_DT%>% cbind(stringr::str_split_fixed(TEN_LOW_HIGH_DT$Peptide,n=10,pattern = ""))
#NINE_ENHANCED_IMP_DT%>% mutate("10"="-")%>% rbind(TEN_ENHANCED_IMP_DT)%>%
# pivot_longer(cols = `1`:`10`)%>%dplyr::rename(letter=value,position=name)%>%
# inner_join(chemistry)%>%filter(position %in% c(1,2,9,10))%>%
#
#ggplot(aes(x=Peptide, y=letter, color=group))+geom_point()+facet_wrap(~position+Dataset,scales="free_x",nrow=4)+rotate_x_text()
NINEMER_AA_BACKGROUND=REMAINING_CHEMISTRY_MUTS_DT %>% select(Peptide)%>% distinct() %>% filter(nchar(Peptide)==9)
NINEMER_AA_BACKGROUND=NINEMER_AA_BACKGROUND%>% cbind(stringr::str_split_fixed(NINEMER_AA_BACKGROUND$Peptide,n=9,pattern = ""))
TENMER_AA_BACKGROUND=REMAINING_CHEMISTRY_MUTS_DT %>% select(Peptide)%>% distinct() %>% filter(nchar(Peptide)==10)
TENMER_AA_BACKGROUND=TENMER_AA_BACKGROUND%>% cbind(stringr::str_split_fixed(TENMER_AA_BACKGROUND$Peptide,n=10,pattern = ""))
BACKGROUND_AA_DISTR=NINEMER_AA_BACKGROUND %>% mutate("10"="-") %>% rbind(TENMER_AA_BACKGROUND)
BACKGROUND_AA_DISTR=BACKGROUND_AA_DISTR%>%
pivot_longer(cols = `1`:`10`)%>%dplyr::rename(letter=value,position=name)%>% ungroup()
SAMPLED_BACKGROUND_AA_DISTR=foreach::foreach(i=1:10, .combine = "rbind", .packages = c("dplyr","data.table"))%do% {
SAMPLED_PEPS=BACKGROUND_AA_DISTR %>% select(Peptide) %>% distinct()%>% sample_n(size=5)%>% pull(Peptide)
BACKGROUND_AA_DISTR %>% filter(Peptide %in% SAMPLED_PEPS)%>% group_by(letter)%>% dplyr::summarise(n=n())%>% mutate(ID=i)
}
SAMPLED_BACKGROUND_AA_DISTR=SAMPLED_BACKGROUND_AA_DISTR %>% mutate(Dataset = "CTRL")%>% select(!ID)%>% inner_join(chemistry %>% select(! col))
## Joining, by = "letter"
AA_COMP_TRANSITIONPOT_GRPS=NINE_LOW_HIGH_DT%>% mutate("10"="-")%>% rbind(TEN_LOW_HIGH_DT)%>%
pivot_longer(cols = `1`:`10`)%>%dplyr::rename(letter=value,position=name)%>%
inner_join(chemistry)%>% group_by(Dataset, letter, group) %>% dplyr::summarise(n=n())%>%
rbind(SAMPLED_BACKGROUND_AA_DISTR)%>%
group_by(Dataset, letter, group)%>% dplyr::summarise(meanN=mean(n), sd=sd(n))%>%
filter(group == "Hydrophobic")%>%
ggplot(aes(x=letter, y=meanN, fill=Dataset))+geom_bar(stat="identity",position=position_dodge())+
geom_errorbar(aes(ymin=meanN-sd, ymax=meanN+sd), width=.2, position=position_dodge(.9))+
facet_wrap(~group,scales="free",nrow=1)+theme_pubr(base_size = 16)+ylab("Count")+xlab("Amino Acid")
## Joining, by = "letter"
## `summarise()` has grouped output by 'Dataset', 'letter'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'Dataset', 'letter'. You can override using the `.groups` argument.
AA_COMP_TRANSITIONPOT_GRPS
saveRDS(AA_COMP_TRANSITIONPOT_GRPS,file="AA_COMP_TRANSITIONPOT_GRPS.rds")
HYDRO_AA=chemistry %>% filter(group=="Hydrophobic")%>% pull(letter)
LENGTH=9
POSITIONS = c(3,4,5,6,7,8)
MUTS_TO_ANALYSE=FULL_PREDICTIONS_DT %>% filter(ChangeTo %in% HYDRO_AA)%>% filter(SeqMutationPos %in% POSITIONS)%>% filter(Length == LENGTH)%>%
mutate(ChangeToPos=paste0(SeqMutationPos,"_",ChangeTo))%>% group_by(ChangeToPos)%>%
dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<20)%>%
slice_max(order_by = meanlogsOdds, prop = 0.3)%>% pull(ChangeToPos)
FULL_PREDICTIONS_DT %>% mutate(ChangeToPos=paste0(SeqMutationPos,"_",ChangeTo)) %>% filter(ChangeToPos%in% MUTS_TO_ANALYSE)%>%
ggscatter(x="logsOdds",y="MT_MHCScore", # Customize reg. line
conf.int = TRUE, # Add confidence interval
cor.coef = TRUE, # Add correlation coefficient. see ?stat_cor
cor.coeff.args = list(method = "pearson", label.x = -3, label.sep = "\n"),
add="loess")
## `geom_smooth()` using formula 'y ~ x'
FULL_PREDICTIONS_DT %>% mutate(ChangeToPos=paste0(SeqMutationPos,"_",ChangeTo)) %>% filter(ChangeToPos%in% MUTS_TO_ANALYSE)%>%
ggscatter(x="logsOdds",y="TRAPP_MUTANT_Prediction", # Customize reg. line
conf.int = TRUE, # Add confidence interval
cor.coef = TRUE, # Add correlation coefficient. see ?stat_cor
cor.coeff.args = list(method = "pearson", label.x = -3, label.sep = "\n"),
add="loess")
## `geom_smooth()` using formula 'y ~ x'
LENGTH=10
POSITIONS = c(3,4,5,6,7,8,9)
MUTS_TO_ANALYSE=FULL_PREDICTIONS_DT %>% filter(ChangeTo %in% HYDRO_AA)%>% filter(SeqMutationPos %in% POSITIONS)%>% filter(Length == LENGTH)%>%
mutate(ChangeToPos=paste0(SeqMutationPos,"_",ChangeTo))%>% group_by(ChangeToPos)%>%
dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<20)%>%
slice_max(order_by = meanlogsOdds, prop = 0.3)%>% pull(ChangeToPos)
FULL_PREDICTIONS_DT %>% mutate(ChangeToPos=paste0(SeqMutationPos,"_",ChangeTo)) %>% filter(ChangeToPos%in% MUTS_TO_ANALYSE)%>%
ggscatter(x="logsOdds",y="MT_MHCScore", # Customize reg. line
conf.int = TRUE, # Add confidence interval
cor.coef = TRUE, # Add correlation coefficient. see ?stat_cor
cor.coeff.args = list(method = "pearson", label.x = -3, label.sep = "\n"),
add="loess")
## `geom_smooth()` using formula 'y ~ x'
FULL_PREDICTIONS_DT %>% mutate(ChangeToPos=paste0(SeqMutationPos,"_",ChangeTo)) %>% filter(ChangeToPos%in% MUTS_TO_ANALYSE)%>%
ggscatter(x="logsOdds",y="TRAPP_MUTANT_Prediction", # Customize reg. line
conf.int = TRUE, # Add confidence interval
cor.coef = TRUE, # Add correlation coefficient. see ?stat_cor
cor.coeff.args = list(method = "pearson", label.x = -3, label.sep = "\n"),
add="loess")
## `geom_smooth()` using formula 'y ~ x'
HYDRO_AA=chemistry %>% filter(group=="Hydrophobic")%>% pull(letter)
LENGTH=9
POSITIONS = c(1,2,9)
MUTS_TO_ANALYSE=FULL_PREDICTIONS_DT %>% filter(ChangeTo %in% HYDRO_AA)%>% filter(SeqMutationPos %in% POSITIONS)%>% filter(Length == LENGTH)%>%
mutate(ChangeToPos=paste0(SeqMutationPos,"_",ChangeTo))%>% group_by(ChangeToPos)%>%
dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<20)%>%
slice_max(order_by = meanlogsOdds, prop = 0.3)%>% pull(ChangeToPos)
FULL_PREDICTIONS_DT %>% mutate(ChangeToPos=paste0(SeqMutationPos,"_",ChangeTo)) %>% filter(ChangeToPos%in% MUTS_TO_ANALYSE)%>%
ggscatter(x="logsOdds",y="MT_MHCScore", # Customize reg. line
conf.int = TRUE, # Add confidence interval
cor.coef = TRUE, # Add correlation coefficient. see ?stat_cor
cor.coeff.args = list(method = "pearson", label.x = -3, label.sep = "\n"),
add="loess")
## `geom_smooth()` using formula 'y ~ x'
FULL_PREDICTIONS_DT %>% mutate(ChangeToPos=paste0(SeqMutationPos,"_",ChangeTo)) %>% filter(ChangeToPos%in% MUTS_TO_ANALYSE)%>%
ggscatter(x="logsOdds",y="TRAPP_MUTANT_Prediction", # Customize reg. line
conf.int = TRUE, # Add confidence interval
cor.coef = TRUE, # Add correlation coefficient. see ?stat_cor
cor.coeff.args = list(method = "pearson", label.x = -3, label.sep = "\n"),
add="loess")
## `geom_smooth()` using formula 'y ~ x'
LENGTH=10
POSITIONS = c(1,2,10)
MUTS_TO_ANALYSE=FULL_PREDICTIONS_DT %>% filter(ChangeTo %in% HYDRO_AA)%>% filter(SeqMutationPos %in% POSITIONS)%>% filter(Length == LENGTH)%>%
mutate(ChangeToPos=paste0(SeqMutationPos,"_",ChangeTo))%>% group_by(ChangeToPos)%>%
dplyr::summarise(meanlogsOdds=mean(logsOdds),n=n())%>% arrange(desc(meanlogsOdds))%>% filter(! n<20)%>%
slice_max(order_by = meanlogsOdds, prop = 0.3)%>% pull(ChangeToPos)
FULL_PREDICTIONS_DT %>% mutate(ChangeToPos=paste0(SeqMutationPos,"_",ChangeTo)) %>% filter(ChangeToPos%in% MUTS_TO_ANALYSE)%>%
ggscatter(x="logsOdds",y="MT_MHCScore", # Customize reg. line
conf.int = TRUE, # Add confidence interval
cor.coef = TRUE, # Add correlation coefficient. see ?stat_cor
cor.coeff.args = list(method = "pearson", label.x = -3, label.sep = "\n"),
add="loess")
## `geom_smooth()` using formula 'y ~ x'
FULL_PREDICTIONS_DT %>% mutate(ChangeToPos=paste0(SeqMutationPos,"_",ChangeTo)) %>% filter(ChangeToPos%in% MUTS_TO_ANALYSE)%>%
ggscatter(x="logsOdds",y="TRAPP_MUTANT_Prediction", # Customize reg. line
conf.int = TRUE, # Add confidence interval
cor.coef = TRUE, # Add correlation coefficient. see ?stat_cor
cor.coeff.args = list(method = "pearson", label.x = -3, label.sep = "\n"),
add="loess")
## `geom_smooth()` using formula 'y ~ x'